From 07d43784b4b90a42b367c5dba4b73abe5ded6c1c Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Tue, 23 Nov 2021 15:22:58 +0800 Subject: [PATCH 01/14] add query_memtracker, add tcmalloc hook, add thread status --- be/src/exec/olap_scan_node.cpp | 3 + be/src/runtime/exec_env.h | 9 ++- be/src/runtime/exec_env_init.cpp | 10 ++- be/src/runtime/fragment_mgr.cpp | 3 + be/src/runtime/mem_tracker.cpp | 45 +++++++------ be/src/runtime/mem_tracker.h | 71 +++++++++++---------- be/src/runtime/plan_fragment_executor.cpp | 2 + be/src/runtime/runtime_state.cpp | 4 ++ be/src/runtime/runtime_state.h | 1 + be/src/runtime/tcmalloc_hook.h | 47 ++++++++++++++ be/src/runtime/thread_status.h | 78 +++++++++++++++++++++++ be/src/service/doris_main.cpp | 3 +- 12 files changed, 218 insertions(+), 58 deletions(-) create mode 100644 be/src/runtime/tcmalloc_hook.h create mode 100644 be/src/runtime/thread_status.h diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index 4b1f775bcdb72c..af9b6dbce991f8 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -29,6 +29,7 @@ #include "exprs/expr_context.h" #include "exprs/runtime_filter.h" #include "gen_cpp/PlanNodes_types.h" +#include "runtime/thread_status.h" #include "runtime/exec_env.h" #include "runtime/row_batch.h" #include "runtime/runtime_filter_mgr.h" @@ -1501,6 +1502,7 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { } void OlapScanNode::scanner_thread(OlapScanner* scanner) { + current_thread.attach_query(scanner->runtime_state()->query_id()); if (UNLIKELY(_transfer_done)) { _scanner_done = true; std::unique_lock l(_scan_batches_lock); @@ -1659,6 +1661,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { // and transfer thread _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); + current_thread.update_mem_tracker(nullptr); } Status OlapScanNode::add_one_batch(RowBatch* row_batch) { diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h 
index 39808bf8704201..ec16ce2911acfa 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -45,7 +45,7 @@ class LoadPathMgr; class LoadStreamMgr; class MemTracker; class StorageEngine; -class PoolMemTrackerRegistry; +class QueryMemTrackerRegistry; class PriorityThreadPool; class ReservationTracker; class ResultBufferMgr; @@ -96,6 +96,7 @@ class ExecEnv { // declarations for classes in scoped_ptrs. ~ExecEnv(); + bool is_init() { return _is_init; } const std::string& token() const; ExternalScanContextMgr* external_scan_context_mgr() { return _external_scan_context_mgr; } DataStreamMgr* stream_mgr() { return _stream_mgr; } @@ -116,7 +117,8 @@ class ExecEnv { } std::shared_ptr process_mem_tracker() { return _mem_tracker; } - PoolMemTrackerRegistry* pool_mem_trackers() { return _pool_mem_trackers; } + std::shared_ptr hook_process_mem_tracker() { return _hook_mem_tracker; } + QueryMemTrackerRegistry* query_mem_trackers() { return _query_mem_trackers; } ThreadResourceMgr* thread_mgr() { return _thread_mgr; } PriorityThreadPool* scan_thread_pool() { return _scan_thread_pool; } ThreadPool* limited_scan_thread_pool() { return _limited_scan_thread_pool.get(); } @@ -181,7 +183,8 @@ class ExecEnv { ClientCache* _broker_client_cache = nullptr; ClientCache* _extdatasource_client_cache = nullptr; std::shared_ptr _mem_tracker; - PoolMemTrackerRegistry* _pool_mem_trackers = nullptr; + std::shared_ptr _hook_mem_tracker = nullptr; + QueryMemTrackerRegistry* _query_mem_trackers = nullptr; ThreadResourceMgr* _thread_mgr = nullptr; // The following two thread pools are used in different scenarios. 
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index eb29e5eaae3048..c98af26011564a 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -93,7 +93,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _broker_client_cache = new BrokerServiceClientCache(config::max_client_cache_size_per_host); _extdatasource_client_cache = new ExtDataSourceServiceClientCache(config::max_client_cache_size_per_host); - _pool_mem_trackers = new PoolMemTrackerRegistry(); + _query_mem_trackers = new QueryMemTrackerRegistry(); _thread_mgr = new ThreadResourceMgr(); _scan_thread_pool = new PriorityThreadPool(config::doris_scanner_thread_pool_thread_num, config::doris_scanner_thread_pool_queue_size); @@ -177,6 +177,12 @@ Status ExecEnv::_init_mem_tracker() { MemTracker::GetRootTracker(), false, false, MemTrackerLevel::OVERVIEW); REGISTER_HOOK_METRIC(query_mem_consumption, [this]() { return _mem_tracker->consumption(); }); + // TODO(zxy): Will replace _mem_tracker as process_mem_tracker in future. 
+ // The statistic memory consumption is duplicated with _mem_tracker, + // which will cause the RootTracker statistic value to be much larger than actual, + _hook_mem_tracker = MemTracker::CreateTracker(global_memory_limit_bytes, "TcmallocHook Process", + MemTracker::GetRootTracker(), false, false, + MemTrackerLevel::OVERVIEW); LOG(INFO) << "Using global memory limit: " << PrettyPrinter::print(global_memory_limit_bytes, TUnit::BYTES) << ", origin config value: " << config::mem_limit; @@ -300,7 +306,7 @@ void ExecEnv::_destroy() { SAFE_DELETE(_etl_thread_pool); SAFE_DELETE(_scan_thread_pool); SAFE_DELETE(_thread_mgr); - SAFE_DELETE(_pool_mem_trackers); + SAFE_DELETE(_query_mem_trackers); SAFE_DELETE(_broker_client_cache); SAFE_DELETE(_extdatasource_client_cache); SAFE_DELETE(_frontend_client_cache); diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index dcff85c2d5a2df..8d629a92380086 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -43,6 +43,7 @@ #include "runtime/stream_load/load_stream_mgr.h" #include "runtime/stream_load/stream_load_context.h" #include "runtime/stream_load/stream_load_pipe.h" +#include "runtime/thread_status.h" #include "service/backend_options.h" #include "util/debug_util.h" #include "util/doris_metrics.h" @@ -461,6 +462,7 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi .query_id(exec_state->query_id()) .instance_id(exec_state->fragment_instance_id()) .tag("pthread_id", std::to_string((uintptr_t)pthread_self())); + current_thread.attach_query(exec_state->query_id()); exec_state->execute(); std::shared_ptr fragments_ctx = exec_state->get_fragments_ctx(); @@ -481,6 +483,7 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi // Callback after remove from this id cb(exec_state->executor()); + current_thread.update_mem_tracker(nullptr); } Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params) { diff --git 
a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index 350f7bc3119668..b4e23ff8ae74f2 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -57,8 +57,8 @@ namespace doris { const std::string MemTracker::COUNTER_NAME = "PeakMemoryUsage"; -// Name for request pool MemTrackers. '$0' is replaced with the pool name. -const std::string REQUEST_POOL_MEM_TRACKER_LABEL_FORMAT = "RequestPool=$0"; +// Name for query MemTrackers. '$0' is replaced with the query id. +const std::string QUERY_MEM_TRACKER_LABEL_FORMAT = "queryId=$0"; /// Calculate the soft limit for a MemTracker based on the hard limit 'limit'. static int64_t CalcSoftLimit(int64_t limit) { @@ -213,8 +213,6 @@ void MemTracker::RefreshConsumptionFromMetric() { } int64_t MemTracker::GetPoolMemReserved() { - // Pool trackers should have a pool_name_ and no limit. - DCHECK(!pool_name_.empty()); DCHECK_EQ(limit_, -1) << LogUsage(UNLIMITED_DEPTH); // Use cache to avoid holding child_trackers_lock_ @@ -242,23 +240,34 @@ int64_t MemTracker::GetPoolMemReserved() { return mem_reserved; } -std::shared_ptr PoolMemTrackerRegistry::GetRequestPoolMemTracker( - const string& pool_name, bool create_if_not_present) { - DCHECK(!pool_name.empty()); - lock_guard l(pool_to_mem_trackers_lock_); - PoolTrackersMap::iterator it = pool_to_mem_trackers_.find(pool_name); - if (it != pool_to_mem_trackers_.end()) { +std::shared_ptr QueryMemTrackerRegistry::RegisterQueryMemTracker(const std::string& query_id, + int64_t mem_limit) { + DCHECK(!query_id.empty()); + if (mem_limit != -1) { + if (mem_limit > MemInfo::physical_mem()) { + LOG(WARNING) << "Memory limit " << PrettyPrinter::print(mem_limit, TUnit::BYTES) + << " exceeds physical memory of " + << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); + } + VLOG(2) << "Using query memory limit: " + << PrettyPrinter::print(mem_limit, TUnit::BYTES); + } + + lock_guard l(_query_mem_trackers_lock); + QueryTrackersMap::iterator it = 
_query_mem_trackers.find(query_id); + if (it != _query_mem_trackers.end()) { MemTracker* tracker = it->second.get(); - DCHECK(pool_name == tracker->pool_name_); + DCHECK(query_id == tracker->query_id()); return it->second; } - if (!create_if_not_present) return nullptr; - // First time this pool_name registered, make a new object. + + // First time this query_id registered, make a new object. std::shared_ptr tracker = MemTracker::CreateTracker( - -1, strings::Substitute(REQUEST_POOL_MEM_TRACKER_LABEL_FORMAT, pool_name), - ExecEnv::GetInstance()->process_mem_tracker()); - tracker->pool_name_ = pool_name; - pool_to_mem_trackers_.emplace(pool_name, std::shared_ptr(tracker)); + mem_limit, strings::Substitute(QUERY_MEM_TRACKER_LABEL_FORMAT, query_id), + ExecEnv::GetInstance()->hook_process_mem_tracker(), false, false, + MemTrackerLevel::OVERVIEW); + tracker->set_query_id(query_id); + _query_mem_trackers.emplace(query_id, std::shared_ptr(tracker)); return tracker; } @@ -471,7 +480,7 @@ void MemTracker::GetTopNQueries( MemTracker* MemTracker::GetQueryMemTracker() { MemTracker* tracker = this; - while (tracker != nullptr) { + while (tracker != nullptr && !tracker->is_query_mem_tracker_) { tracker = tracker->parent_.get(); } return tracker; diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index 1622a70e71adba..914ee17b1e515d 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -117,8 +118,6 @@ class MemTracker : public std::enable_shared_from_this { // Gets a shared_ptr to the "root" tracker, creating it if necessary. static std::shared_ptr GetRootTracker(); - // delete static CreateQueryMemTracker(), cuz it cannot use shared tracker - /// Increases consumption of this tracker and its ancestors by 'bytes'. 
void Consume(int64_t bytes) { // DCHECK_GE(bytes, 0); @@ -311,6 +310,14 @@ class MemTracker : public std::enable_shared_from_this { } const std::string& label() const { return label_; } + std::string query_id() { + return query_id_; + } + void set_query_id(const std::string& query_id) { + query_id_ = query_id; + is_query_mem_tracker_ = true; + } + /// Returns the lowest limit for this tracker and its ancestors. Returns /// -1 if there is no limit. int64_t GetLowestLimit(MemLimit mode) const; @@ -431,7 +438,7 @@ class MemTracker : public std::enable_shared_from_this { const std::shared_ptr& parent, bool log_usage_if_zero, MemTrackerLevel); private: - friend class PoolMemTrackerRegistry; + friend class QueryMemTrackerRegistry; // TODO(HW): remove later /// Closes this MemTracker. After closing it is invalid to consume memory on this @@ -499,14 +506,11 @@ class MemTracker : public std::enable_shared_from_this { /// Lock to protect GcMemory(). This prevents many GCs from occurring at once. std::mutex gc_lock_; - /// Only used if 'is_query_mem_tracker_' is true. - /// 0 if the query is still executing or 1 if it has finished executing. Before - /// it has finished executing, the tracker limit is treated as "reserved memory" - /// for the purpose of admission control - see GetPoolMemReserved(). - std::atomic query_exec_finished_ {0}; + /// True if this is a Query MemTracker returned from RegisterQueryMemTracker(). + bool is_query_mem_tracker_ = false; - /// Only valid for MemTrackers returned from GetRequestPoolMemTracker() - std::string pool_name_; + /// Only valid for MemTrackers returned from RegisterQueryMemTracker() + std::string query_id_; /// Hard limit on memory consumption, in bytes. May not be exceeded. If limit_ == -1, /// there is no consumption limit. @@ -573,33 +577,32 @@ class MemTracker : public std::enable_shared_from_this { IntGauge* limit_metric_; }; -/// Global registry for query and pool MemTrackers. Owned by ExecEnv. 
-class PoolMemTrackerRegistry { +// Global registry for query MemTrackers. Owned by ExecEnv. +class QueryMemTrackerRegistry { public: - /// Returns a MemTracker object for request pool 'pool_name'. Calling this with the same - /// 'pool_name' will return the same MemTracker object. This is used to track the local - /// memory usage of all requests executing in this pool. If 'create_if_not_present' is - /// true, the first time this is called for a pool, a new MemTracker object is created - /// with the process tracker as its parent. There is no explicit per-pool byte_limit - /// set at any particular impalad, so newly created trackers will always have a limit - /// of -1. - /// TODO(cmy): this function is not used for now. the memtracker returned from here is - /// got from a shared_ptr in `pool_to_mem_trackers_`. - /// This funtion is from - /// https://github.com/cloudera/Impala/blob/495397101e5807c701df71ea288f4815d69c2c8a/be/src/runtime/mem-tracker.h#L497 - /// And in impala this function will return a raw pointer. - std::shared_ptr GetRequestPoolMemTracker(const std::string& pool_name, - bool create_if_not_present); + // Construct a MemTracker object for 'query_id' with 'mem_limit' as the memory limit. + // The MemTracker is a child of the process MemTracker, Calling this with the same + // 'query_id' will return the same MemTracker object. This is used to track the local + // memory usage of all querys executing. The first time this is called for a query, + // a new MemTracker object is created with the process tracker as its parent. + // Newly created trackers will always have a limit of -1. + std::shared_ptr RegisterQueryMemTracker(const std::string& query_id, + int64_t mem_limit = -1); private: - /// All per-request pool MemTracker objects. It is assumed that request pools will live - /// for the entire duration of the process lifetime so MemTrackers are never removed - /// from this map. 
Protected by '_pool_to_mem_trackers_lock' - typedef std::unordered_map> PoolTrackersMap; - PoolTrackersMap pool_to_mem_trackers_; - /// IMPALA-3068: Use SpinLock instead of std::mutex so that the lock won't - /// automatically destroy itself as part of process teardown, which could cause races. - SpinLock pool_to_mem_trackers_lock_; + // All per-query MemTracker objects. + // The life cycle of Query memtracker in the process is the same as `query_timeout`, + // MemTrackers will be removed from this map after timeout. + using QueryTrackersMap = phmap::parallel_flat_hash_map< + std::string, std::shared_ptr, + phmap::priv::hash_default_hash, + phmap::priv::hash_default_eq, + std::allocator>>, + 12, std::mutex>; + QueryTrackersMap _query_mem_trackers; + // Use SpinLock instead of std::mutex so that the lock won't + // automatically destroy itself as part of process teardown, which could cause races. + SpinLock _query_mem_trackers_lock; }; } // namespace doris diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index 92aefc4d282588..98a33ab6bb2bce 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -28,6 +28,7 @@ #include "exec/scan_node.h" #include "exprs/expr.h" #include "runtime/data_stream_mgr.h" +#include "runtime/thread_status.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/mem_tracker.h" @@ -86,6 +87,7 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _runtime_state->set_query_fragments_ctx(fragments_ctx); RETURN_IF_ERROR(_runtime_state->init_mem_trackers(_query_id)); + current_thread.attach_query(_query_id); _runtime_state->set_be_number(request.backend_num); if (request.__isset.backend_id) { _runtime_state->set_backend_id(request.backend_id); diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index e506189383a241..c75863d63be384 100644 --- 
a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -31,6 +31,7 @@ #include "runtime/buffered_block_mgr2.h" #include "runtime/bufferpool/reservation_tracker.h" #include "runtime/bufferpool/reservation_util.h" +#include "runtime/thread_status.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/initial_reservations.h" @@ -220,6 +221,9 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { _query_mem_tracker = MemTracker::CreateTracker(bytes_limit, "RuntimeState:query:" + print_id(query_id), _exec_env->process_mem_tracker(), true, false); + _hook_query_mem_tracker = + _exec_env->query_mem_trackers()->RegisterQueryMemTracker(doris::print_id(query_id), bytes_limit); + _instance_mem_tracker = MemTracker::CreateTracker(&_profile, -1, "RuntimeState:instance:", _query_mem_tracker); diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 449b4c2a1738ac..030cfb952756a3 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -406,6 +406,7 @@ class RuntimeState { // MemTracker that is shared by all fragment instances running on this host. // The query mem tracker must be released after the _instance_mem_tracker. std::shared_ptr _query_mem_tracker; + std::shared_ptr _hook_query_mem_tracker; // Memory usage of this fragment instance std::shared_ptr _instance_mem_tracker; diff --git a/be/src/runtime/tcmalloc_hook.h b/be/src/runtime/tcmalloc_hook.h new file mode 100644 index 00000000000000..a3a8b887b6f237 --- /dev/null +++ b/be/src/runtime/tcmalloc_hook.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include + +#include "runtime/thread_status.h" + +static int new_hook_calls = 0; +static int delete_hook_calls = 0; + +void new_hook(const void* ptr, size_t size) { + new_hook_calls++; + doris::current_thread.consume_mem(tc_nallocx(size, 0)); +} + +void delete_hook(const void* ptr) { + delete_hook_calls++; + doris::current_thread.release_mem(tc_malloc_size(const_cast(ptr))); +} + +void init_hook() { + assert(MallocHook::AddNewHook(&new_hook)); + assert(MallocHook::AddDeleteHook(&delete_hook)); +} + +void destroy_hook() { + assert(MallocHook::RemoveNewHook(&new_hook)); + assert(MallocHook::RemoveDeleteHook(&delete_hook)); +} \ No newline at end of file diff --git a/be/src/runtime/thread_status.h b/be/src/runtime/thread_status.h new file mode 100644 index 00000000000000..edec3d61613bd5 --- /dev/null +++ b/be/src/runtime/thread_status.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "runtime/exec_env.h" +#include "runtime/mem_tracker.h" + +namespace doris { + +class ThreadStatus { +public: + ThreadStatus() : _thread_id(std::this_thread::get_id()) {} + ~ThreadStatus() { update_mem_tracker(nullptr); } + + void attach_query(const doris::TUniqueId& query_id) { + _query_id = doris::print_id(query_id); + update_mem_tracker(ExecEnv::GetInstance()->query_mem_trackers()->RegisterQueryMemTracker( + doris::print_id(query_id))); + } + + void update_mem_tracker(std::shared_ptr mem_tracker) { + if (_untracked_mem != 0 && _mem_tracker != nullptr) { + if (!_mem_tracker->TryConsume(_untracked_mem)) { + return; // add call back + } + _untracked_mem = 0; + } + _mem_tracker = mem_tracker; + } + + void consume(int64_t size) { + if (_mem_tracker == nullptr && ExecEnv::GetInstance()->is_init()) { + _mem_tracker = ExecEnv::GetInstance()->hook_process_mem_tracker(); + } + _untracked_mem += size; + if (_mem_tracker != nullptr && (_untracked_mem >= _s_untracked_mem_limit || + _untracked_mem <= -_s_untracked_mem_limit)) { + if (!_mem_tracker->TryConsume(_untracked_mem)) { + return; // add call back + } + _untracked_mem = 0; + } + } + + void consume_mem(int64_t size) { consume(size); } + + void release_mem(int64_t size) { consume(-size); } + + const std::string& query_id() { return _query_id; } + const std::thread::id& thread_id() { return _thread_id; } + +private: + std::thread::id _thread_id; + std::string _query_id; + std::shared_ptr _mem_tracker = nullptr; + int64_t _untracked_mem = 0; + int64_t 
_s_untracked_mem_limit = 1 * 1024 * 1024; +}; + +inline thread_local ThreadStatus current_thread; +} // namespace doris diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index fba366931eb391..21f8779e48aa1e 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -46,6 +46,7 @@ #include "runtime/exec_env.h" #include "runtime/heartbeat_flags.h" #include "runtime/minidump.h" +#include "runtime/tcmalloc_hook.h" #include "service/backend_options.h" #include "service/backend_service.h" #include "service/brpc_service.h" @@ -75,7 +76,7 @@ static void thrift_output(const char* x) { } // namespace doris int main(int argc, char** argv) { - + init_hook(); // check if print version or help if (argc > 1) { if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { From 40bce2f29113856e8130bcbac2d8f41eb2091efd Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Tue, 23 Nov 2021 18:56:07 +0800 Subject: [PATCH 02/14] remove lock, delete query tracker --- be/src/runtime/mem_tracker.cpp | 52 +++++++++++++++++--------------- be/src/runtime/mem_tracker.h | 30 +++++++++--------- be/src/runtime/runtime_state.cpp | 1 + be/src/runtime/tcmalloc_hook.h | 12 +++----- 4 files changed, 50 insertions(+), 45 deletions(-) diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index b4e23ff8ae74f2..ab44695d5d90d3 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -78,7 +78,7 @@ void MemTracker::CreateRootTracker() { std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, int64_t byte_limit, const std::string& label, const std::shared_ptr& parent, - bool reset_label_name, MemTrackerLevel level) { + bool reset_label_name, MemTrackerLevel level, std::string query_id) { std::shared_ptr real_parent; std::string label_name; // if parent is not null, reset label name to query id. 
@@ -105,12 +105,14 @@ std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, i level > real_parent->_level ? level : real_parent->_level)); real_parent->AddChildTracker(tracker); tracker->Init(); + tracker->set_query_id(query_id); return tracker; } std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const std::string& label, - std::shared_ptr parent, bool log_usage_if_zero, bool reset_label_name, MemTrackerLevel level) { + std::shared_ptr parent, bool log_usage_if_zero, bool reset_label_name, + MemTrackerLevel level, std::string query_id) { std::shared_ptr real_parent; std::string label_name; // if parent is not null, reset label name to query id. @@ -138,6 +140,7 @@ std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const level > real_parent->_level ? level : real_parent->_level)); real_parent->AddChildTracker(tracker); tracker->Init(); + tracker->set_query_id(query_id); return tracker; } @@ -240,37 +243,38 @@ int64_t MemTracker::GetPoolMemReserved() { return mem_reserved; } -std::shared_ptr QueryMemTrackerRegistry::RegisterQueryMemTracker(const std::string& query_id, - int64_t mem_limit) { +std::shared_ptr QueryMemTrackerRegistry::RegisterQueryMemTracker( + const std::string& query_id, int64_t mem_limit) { DCHECK(!query_id.empty()); if (mem_limit != -1) { if (mem_limit > MemInfo::physical_mem()) { - LOG(WARNING) << "Memory limit " << PrettyPrinter::print(mem_limit, TUnit::BYTES) - << " exceeds physical memory of " - << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); + LOG(WARNING) << "Memory limit " << PrettyPrinter::print(mem_limit, TUnit::BYTES) + << " exceeds physical memory of " + << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); } - VLOG(2) << "Using query memory limit: " - << PrettyPrinter::print(mem_limit, TUnit::BYTES); - } - - lock_guard l(_query_mem_trackers_lock); - QueryTrackersMap::iterator it = _query_mem_trackers.find(query_id); - if (it != _query_mem_trackers.end()) { - MemTracker* 
tracker = it->second.get(); - DCHECK(query_id == tracker->query_id()); - return it->second; + VLOG(2) << "Using query memory limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); } - // First time this query_id registered, make a new object. - std::shared_ptr tracker = MemTracker::CreateTracker( - mem_limit, strings::Substitute(QUERY_MEM_TRACKER_LABEL_FORMAT, query_id), - ExecEnv::GetInstance()->hook_process_mem_tracker(), false, false, - MemTrackerLevel::OVERVIEW); - tracker->set_query_id(query_id); - _query_mem_trackers.emplace(query_id, std::shared_ptr(tracker)); + // First time this query_id registered, make a new object, otherwise do nothing. + _query_mem_trackers.try_emplace_l( + query_id, [](std::shared_ptr) {}, + MemTracker::CreateTracker(mem_limit, + strings::Substitute(QUERY_MEM_TRACKER_LABEL_FORMAT, query_id), + ExecEnv::GetInstance()->hook_process_mem_tracker(), false, + false, MemTrackerLevel::OVERVIEW, query_id)); + + std::shared_ptr tracker = nullptr; + _query_mem_trackers.if_contains(query_id, + [&tracker](std::shared_ptr v) { tracker = v; }); return tracker; } +void QueryMemTrackerRegistry::DeregisterQueryMemTracker(const std::string& query_id) { + DCHECK(!query_id.empty()); + _query_mem_trackers.erase_if(query_id, [](std::shared_ptr) { return true; }); + LOG(WARNING) << "DeregisterQueryMemTracker " << query_id << " len " << _query_mem_trackers.size(); +} + MemTracker::~MemTracker() { delete reservation_counters_.load(); diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index 914ee17b1e515d..2dfe9e4ae6ebd3 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -95,12 +95,13 @@ class MemTracker : public std::enable_shared_from_this { int64_t byte_limit = -1, const std::string& label = std::string(), std::shared_ptr parent = std::shared_ptr(), bool log_usage_if_zero = true, bool reset_label_name = true, - MemTrackerLevel level = MemTrackerLevel::VERBOSE); + MemTrackerLevel level = 
MemTrackerLevel::VERBOSE, std::string query_id = std::string()); static std::shared_ptr CreateTracker( RuntimeProfile* profile, int64_t byte_limit, const std::string& label = std::string(), const std::shared_ptr& parent = std::shared_ptr(), - bool reset_label_name = true, MemTrackerLevel level = MemTrackerLevel::VERBOSE); + bool reset_label_name = true, MemTrackerLevel level = MemTrackerLevel::VERBOSE, + std::string query_id = std::string()); // this is used for creating an orphan mem tracker, or for unit test. // If a mem tracker has parent, it should be created by `CreateTracker()` @@ -314,8 +315,10 @@ class MemTracker : public std::enable_shared_from_this { return query_id_; } void set_query_id(const std::string& query_id) { - query_id_ = query_id; - is_query_mem_tracker_ = true; + if (query_id != std::string()) { + query_id_ = query_id; + is_query_mem_tracker_ = true; + } } /// Returns the lowest limit for this tracker and its ancestors. Returns @@ -587,22 +590,21 @@ class QueryMemTrackerRegistry { // a new MemTracker object is created with the process tracker as its parent. // Newly created trackers will always have a limit of -1. std::shared_ptr RegisterQueryMemTracker(const std::string& query_id, - int64_t mem_limit = -1); + int64_t mem_limit = -1); + + void DeregisterQueryMemTracker(const std::string& query_id); private: // All per-query MemTracker objects. - // The life cycle of Query memtracker in the process is the same as `query_timeout`, - // MemTrackers will be removed from this map after timeout. + // The life cycle of query memtracker in the process is the same as query runtime state, + // MemTrackers will be removed from this map after query finish or cancel. 
using QueryTrackersMap = phmap::parallel_flat_hash_map< - std::string, std::shared_ptr, - phmap::priv::hash_default_hash, + std::string, std::shared_ptr, phmap::priv::hash_default_hash, phmap::priv::hash_default_eq, - std::allocator>>, - 12, std::mutex>; + std::allocator>>, 12, + std::mutex>; + QueryTrackersMap _query_mem_trackers; - // Use SpinLock instead of std::mutex so that the lock won't - // automatically destroy itself as part of process teardown, which could cause races. - SpinLock _query_mem_trackers_lock; }; } // namespace doris diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index c75863d63be384..23a230f7d909f0 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -159,6 +159,7 @@ RuntimeState::~RuntimeState() { if (_buffer_reservation != nullptr) { _buffer_reservation->Close(); } + _exec_env->query_mem_trackers()->DeregisterQueryMemTracker(doris::print_id(_query_id)); } Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOptions& query_options, diff --git a/be/src/runtime/tcmalloc_hook.h b/be/src/runtime/tcmalloc_hook.h index a3a8b887b6f237..6c9b3dce1662f1 100644 --- a/be/src/runtime/tcmalloc_hook.h +++ b/be/src/runtime/tcmalloc_hook.h @@ -19,8 +19,6 @@ #include #include -#include - #include "runtime/thread_status.h" static int new_hook_calls = 0; @@ -37,11 +35,11 @@ void delete_hook(const void* ptr) { } void init_hook() { - assert(MallocHook::AddNewHook(&new_hook)); - assert(MallocHook::AddDeleteHook(&delete_hook)); + MallocHook::AddNewHook(&new_hook); + MallocHook::AddDeleteHook(&delete_hook); } void destroy_hook() { - assert(MallocHook::RemoveNewHook(&new_hook)); - assert(MallocHook::RemoveDeleteHook(&delete_hook)); -} \ No newline at end of file + MallocHook::RemoveNewHook(&new_hook); + MallocHook::RemoveDeleteHook(&delete_hook); +} From 5d7dd970a62448328c392bbd6774690b65f3ee3a Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Thu, 25 Nov 2021 10:50:54 +0800 
Subject: [PATCH 03/14] weak ptr --- be/src/runtime/runtime_state.cpp | 3 +- be/src/runtime/thread_context.h | 98 ++++++++++++++++++++++++++++++++ be/src/runtime/thread_status.h | 78 ------------------------- 3 files changed, 99 insertions(+), 80 deletions(-) create mode 100644 be/src/runtime/thread_context.h delete mode 100644 be/src/runtime/thread_status.h diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 23a230f7d909f0..89d9d63bce58c6 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -31,7 +31,7 @@ #include "runtime/buffered_block_mgr2.h" #include "runtime/bufferpool/reservation_tracker.h" #include "runtime/bufferpool/reservation_util.h" -#include "runtime/thread_status.h" +#include "runtime/thread_context.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/initial_reservations.h" @@ -159,7 +159,6 @@ RuntimeState::~RuntimeState() { if (_buffer_reservation != nullptr) { _buffer_reservation->Close(); } - _exec_env->query_mem_trackers()->DeregisterQueryMemTracker(doris::print_id(_query_id)); } Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOptions& query_options, diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h new file mode 100644 index 00000000000000..4df053e0fb453e --- /dev/null +++ b/be/src/runtime/thread_context.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "runtime/exec_env.h" +#include "runtime/mem_tracker.h" + +namespace doris { + +class TheadContext { +public: + TheadContext() + : _thread_id(std::this_thread::get_id()), + _global_hook_tracker(MemTracker::GetGlobalHookTracker()) {} + ~TheadContext() { update_query_mem_tracker(); } + + void attach_query(const doris::TUniqueId& query_id) { + _query_id = doris::print_id(query_id); + update_query_mem_tracker(ExecEnv::GetInstance()->query_mem_trackers()->RegisterQueryMemTracker( + doris::print_id(query_id))); + } + + void unattach_query() { + _query_id = ""; + update_query_mem_tracker(); + } + + void update_query_mem_tracker( + std::weak_ptr mem_tracker = std::weak_ptr()) { + if (_untracked_mem != 0 && !_query_mem_tracker.expired()) { + if (!_query_mem_tracker.lock()->TryConsume(_untracked_mem)) { + return; // add call back + } + _untracked_mem = 0; + } + _query_mem_tracker = mem_tracker; + } + + void consume(int64_t size) { + _untracked_mem += size; + if (_untracked_mem >= _untracked_mem_limit || _untracked_mem <= -_untracked_mem_limit) { + // TODO(zxy): _untracked_mem <0 means that there is the same block of memory, + // tracker A calls consume, and tracker B calls release. This will make the memory + // statistics inaccurate and should be avoided as much as possible. + // This DCHECK should be turned on in the future. + // DCHECK(_untracked_mem >= 0); + + // There is no default tracker to avoid repeated releases of MemTacker. 
+ // When the consume is called on the child MemTracker, + // after the release is called on the parent MemTracker, + // the child ~MemTracker will cause repeated releases. + if (!_query_mem_tracker.expired()) { + if (!_query_mem_tracker.lock()->TryConsume(_untracked_mem)) { + return; // add call back + } + } + if (!_global_hook_tracker->TryConsume(_untracked_mem)) { + return; // add call back + } + _untracked_mem = 0; + } + } + + void consume_mem(int64_t size) { consume(size); } + + void release_mem(int64_t size) { consume(-size); } + + const std::string& query_id() { return _query_id; } + const std::thread::id& thread_id() { return _thread_id; } + +private: + std::thread::id _thread_id; + std::string _query_id; + std::weak_ptr _query_mem_tracker; + std::shared_ptr _global_hook_tracker = nullptr; + int64_t _untracked_mem = 0; + int64_t _untracked_mem_limit = 1 * 1024 * 1024; +}; + +inline thread_local TheadContext thread_local_ctx; +} // namespace doris diff --git a/be/src/runtime/thread_status.h b/be/src/runtime/thread_status.h deleted file mode 100644 index edec3d61613bd5..00000000000000 --- a/be/src/runtime/thread_status.h +++ /dev/null @@ -1,78 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" - -namespace doris { - -class ThreadStatus { -public: - ThreadStatus() : _thread_id(std::this_thread::get_id()) {} - ~ThreadStatus() { update_mem_tracker(nullptr); } - - void attach_query(const doris::TUniqueId& query_id) { - _query_id = doris::print_id(query_id); - update_mem_tracker(ExecEnv::GetInstance()->query_mem_trackers()->RegisterQueryMemTracker( - doris::print_id(query_id))); - } - - void update_mem_tracker(std::shared_ptr mem_tracker) { - if (_untracked_mem != 0 && _mem_tracker != nullptr) { - if (!_mem_tracker->TryConsume(_untracked_mem)) { - return; // add call back - } - _untracked_mem = 0; - } - _mem_tracker = mem_tracker; - } - - void consume(int64_t size) { - if (_mem_tracker == nullptr && ExecEnv::GetInstance()->is_init()) { - _mem_tracker = ExecEnv::GetInstance()->hook_process_mem_tracker(); - } - _untracked_mem += size; - if (_mem_tracker != nullptr && (_untracked_mem >= _s_untracked_mem_limit || - _untracked_mem <= -_s_untracked_mem_limit)) { - if (!_mem_tracker->TryConsume(_untracked_mem)) { - return; // add call back - } - _untracked_mem = 0; - } - } - - void consume_mem(int64_t size) { consume(size); } - - void release_mem(int64_t size) { consume(-size); } - - const std::string& query_id() { return _query_id; } - const std::thread::id& thread_id() { return _thread_id; } - -private: - std::thread::id _thread_id; - std::string _query_id; - std::shared_ptr _mem_tracker = nullptr; - int64_t _untracked_mem = 0; - int64_t _s_untracked_mem_limit = 1 * 1024 * 1024; -}; - -inline thread_local ThreadStatus current_thread; -} // namespace doris From caabb49356bb5bda5634345445f76bb1b24ecf7f Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Fri, 26 Nov 2021 20:24:37 +0800 Subject: [PATCH 04/14] modify hook process tracker, query tracker, add mem limit exceeded call back --- be/src/common/config.h | 5 + be/src/exec/olap_scan_node.cpp | 7 +- be/src/runtime/exec_env.h 
| 17 ++- be/src/runtime/exec_env_init.cpp | 26 ++-- be/src/runtime/fragment_mgr.cpp | 43 ++++-- be/src/runtime/fragment_mgr.h | 5 +- be/src/runtime/mem_tracker.cpp | 77 +++++++--- be/src/runtime/mem_tracker.h | 44 +++++- be/src/runtime/plan_fragment_executor.cpp | 15 +- be/src/runtime/plan_fragment_executor.h | 8 +- be/src/runtime/runtime_state.cpp | 2 +- be/src/runtime/runtime_state.h | 1 + be/src/runtime/tcmalloc_hook.h | 11 +- be/src/runtime/thread_context.h | 136 +++++++++++++----- be/src/service/doris_main.cpp | 1 + be/test/runtime/fragment_mgr_test.cpp | 3 +- be/test/runtime/test_env.cc | 2 +- .../administrator-guide/config/be_config.md | 6 + .../administrator-guide/config/be_config.md | 6 + .../java/org/apache/doris/qe/Coordinator.java | 3 +- gensrc/proto/internal_service.proto | 1 + 21 files changed, 309 insertions(+), 110 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index cc3448928da584..231c7c97d8fa4c 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -602,6 +602,11 @@ CONF_mInt32(remote_storage_read_buffer_mb, "16"); // the level equal or lower than mem_tracker_level will show in web page CONF_Int16(mem_tracker_level, "0"); +// The maximum buffer length allowed when TCMalloc Hook consumes/releases MemTracker, +// that is, the minimum batch of consume/release, specified as number of bytes. +// Increasing this value will increase the frequency of consume/release. +CONF_mInt32(untracked_mem_limit, "4194304"); + // The version information of the tablet will be stored in the memory // in an adjacency graph data structure. 
// And as the new version is written and the old version is deleted, diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index af9b6dbce991f8..48e0e9dac7e1bb 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -29,7 +29,7 @@ #include "exprs/expr_context.h" #include "exprs/runtime_filter.h" #include "gen_cpp/PlanNodes_types.h" -#include "runtime/thread_status.h" +#include "runtime/thread_context.h" #include "runtime/exec_env.h" #include "runtime/row_batch.h" #include "runtime/runtime_filter_mgr.h" @@ -1502,7 +1502,7 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { } void OlapScanNode::scanner_thread(OlapScanner* scanner) { - current_thread.attach_query(scanner->runtime_state()->query_id()); + thread_local_ctx.attach_query(scanner->runtime_state()->query_id(), _runtime_state->fragment_instance_id()); if (UNLIKELY(_transfer_done)) { _scanner_done = true; std::unique_lock l(_scan_batches_lock); @@ -1512,6 +1512,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); LOG(INFO) << "Scan thread cancelled, cause query done, scan thread started to exit"; + thread_local_ctx.unattach_query(); return; } int64_t wait_time = scanner->update_wait_worker_timer(); @@ -1661,7 +1662,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { // and transfer thread _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); - current_thread.update_mem_tracker(nullptr); + thread_local_ctx.unattach_query(); } Status OlapScanNode::add_one_batch(RowBatch* row_batch) { diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index ec16ce2911acfa..ff50a2f0cc650d 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -96,7 +96,7 @@ class ExecEnv { // declarations for classes in scoped_ptrs. 
~ExecEnv(); - bool is_init() { return _is_init; } + const bool is_init() { return _is_init; } const std::string& token() const; ExternalScanContextMgr* external_scan_context_mgr() { return _external_scan_context_mgr; } DataStreamMgr* stream_mgr() { return _stream_mgr; } @@ -116,9 +116,9 @@ class ExecEnv { return nullptr; } - std::shared_ptr process_mem_tracker() { return _mem_tracker; } - std::shared_ptr hook_process_mem_tracker() { return _hook_mem_tracker; } - QueryMemTrackerRegistry* query_mem_trackers() { return _query_mem_trackers; } + std::shared_ptr process_mem_tracker() { return _process_mem_tracker; } + std::shared_ptr all_query_mem_tracker() { return _all_query_mem_tracker; } + QueryMemTrackerRegistry* query_mem_tracker_registry() { return _query_mem_tracker_registry; } ThreadResourceMgr* thread_mgr() { return _thread_mgr; } PriorityThreadPool* scan_thread_pool() { return _scan_thread_pool; } ThreadPool* limited_scan_thread_pool() { return _limited_scan_thread_pool.get(); } @@ -182,9 +182,12 @@ class ExecEnv { ClientCache* _frontend_client_cache = nullptr; ClientCache* _broker_client_cache = nullptr; ClientCache* _extdatasource_client_cache = nullptr; - std::shared_ptr _mem_tracker; - std::shared_ptr _hook_mem_tracker = nullptr; - QueryMemTrackerRegistry* _query_mem_trackers = nullptr; + // The ancestor of all trackers in the process. It is the only child of the root tracker. + // All manually created trackers should specify the process tracker as the parent. + std::shared_ptr _process_mem_tracker = nullptr; + // The ancestor for all querys tracker. + std::shared_ptr _all_query_mem_tracker = nullptr; + QueryMemTrackerRegistry* _query_mem_tracker_registry = nullptr; ThreadResourceMgr* _thread_mgr = nullptr; // The following two thread pools are used in different scenarios. 
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index c98af26011564a..9a7f18b64c2e05 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -93,7 +93,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _broker_client_cache = new BrokerServiceClientCache(config::max_client_cache_size_per_host); _extdatasource_client_cache = new ExtDataSourceServiceClientCache(config::max_client_cache_size_per_host); - _query_mem_trackers = new QueryMemTrackerRegistry(); + _query_mem_tracker_registry = new QueryMemTrackerRegistry(); _thread_mgr = new ThreadResourceMgr(); _scan_thread_pool = new PriorityThreadPool(config::doris_scanner_thread_pool_thread_num, config::doris_scanner_thread_pool_queue_size); @@ -146,7 +146,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _small_file_mgr->init(); _init_mem_tracker(); - RETURN_IF_ERROR(_load_channel_mgr->init(_mem_tracker->limit())); + RETURN_IF_ERROR(_load_channel_mgr->init(_process_mem_tracker->limit())); _heartbeat_flags = new HeartbeatFlags(); _register_metrics(); _is_init = true; @@ -173,16 +173,14 @@ Status ExecEnv::_init_mem_tracker() { << ". Using physical memory instead"; global_memory_limit_bytes = MemInfo::physical_mem(); } - _mem_tracker = MemTracker::CreateTracker(global_memory_limit_bytes, "Process", - MemTracker::GetRootTracker(), false, false, - MemTrackerLevel::OVERVIEW); - REGISTER_HOOK_METRIC(query_mem_consumption, [this]() { return _mem_tracker->consumption(); }); - // TODO(zxy): Will replace _mem_tracker as process_mem_tracker in future. 
- // The statistic memory consumption is duplicated with _mem_tracker, - // which will cause the RootTracker statistic value to be much larger than actual, - _hook_mem_tracker = MemTracker::CreateTracker(global_memory_limit_bytes, "TcmallocHook Process", - MemTracker::GetRootTracker(), false, false, - MemTrackerLevel::OVERVIEW); + _process_mem_tracker = MemTracker::CreateTracker(global_memory_limit_bytes, "Process", + MemTracker::GetRootTracker(), false, false, + MemTrackerLevel::OVERVIEW); + REGISTER_HOOK_METRIC(query_mem_consumption, + [this]() { return _process_mem_tracker->consumption(); }); + _all_query_mem_tracker = + MemTracker::CreateTracker(global_memory_limit_bytes, "All Query", _process_mem_tracker, + false, false, MemTrackerLevel::OVERVIEW); LOG(INFO) << "Using global memory limit: " << PrettyPrinter::print(global_memory_limit_bytes, TUnit::BYTES) << ", origin config value: " << config::mem_limit; @@ -247,7 +245,7 @@ Status ExecEnv::_init_mem_tracker() { SegmentLoader::create_global_instance(config::segment_cache_capacity); // 4. 
init other managers - RETURN_IF_ERROR(_disk_io_mgr->init(_mem_tracker)); + RETURN_IF_ERROR(_disk_io_mgr->init(_process_mem_tracker)); RETURN_IF_ERROR(_tmp_file_mgr->init()); // TODO(zc): The current memory usage configuration is a bit confusing, @@ -306,7 +304,7 @@ void ExecEnv::_destroy() { SAFE_DELETE(_etl_thread_pool); SAFE_DELETE(_scan_thread_pool); SAFE_DELETE(_thread_mgr); - SAFE_DELETE(_query_mem_trackers); + SAFE_DELETE(_query_mem_tracker_registry); SAFE_DELETE(_broker_client_cache); SAFE_DELETE(_extdatasource_client_cache); SAFE_DELETE(_frontend_client_cache); diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 8d629a92380086..c57a691533188d 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -43,7 +43,7 @@ #include "runtime/stream_load/load_stream_mgr.h" #include "runtime/stream_load/stream_load_context.h" #include "runtime/stream_load/stream_load_pipe.h" -#include "runtime/thread_status.h" +#include "runtime/thread_context.h" #include "service/backend_options.h" #include "util/debug_util.h" #include "util/doris_metrics.h" @@ -83,6 +83,7 @@ class FragmentExecState { FragmentExecState(const TUniqueId& query_id, const TUniqueId& instance_id, int backend_num, ExecEnv* exec_env, const TNetworkAddress& coord_addr); + ~FragmentExecState(); Status prepare(const TExecPlanFragmentParams& params); @@ -95,7 +96,7 @@ class FragmentExecState { Status cancel_before_execute(); - Status cancel(const PPlanFragmentCancelReason& reason); + Status cancel(const PPlanFragmentCancelReason& reason, const std::string& msg = ""); TUniqueId fragment_instance_id() const { return _fragment_instance_id; } @@ -135,6 +136,10 @@ class FragmentExecState { return false; } + bool is_canceling() const { return _is_canceling; } + + void set_is_canceling() { _is_canceling = true; } + int get_timeout_second() const { return _timeout_second; } std::shared_ptr get_fragments_ctx() { return _fragments_ctx; } @@ -156,6 +161,7 @@ 
class FragmentExecState { PlanFragmentExecutor _executor; DateTimeValue _start_time; + bool _is_canceling = false; std::mutex _status_lock; Status _exec_status; @@ -166,6 +172,7 @@ class FragmentExecState { int _timeout_second; + std::unique_ptr _exec_thread; // This context is shared by all fragments of this host in a query std::shared_ptr _fragments_ctx; @@ -208,6 +215,7 @@ FragmentExecState::FragmentExecState(const TUniqueId& query_id, _start_time = DateTimeValue::local_time(); } +FragmentExecState::~FragmentExecState() {} Status FragmentExecState::prepare(const TExecPlanFragmentParams& params) { if (params.__isset.query_options) { @@ -251,13 +259,13 @@ Status FragmentExecState::cancel_before_execute() { return Status::OK(); } -Status FragmentExecState::cancel(const PPlanFragmentCancelReason& reason) { +Status FragmentExecState::cancel(const PPlanFragmentCancelReason& reason, const std::string& msg) { std::lock_guard l(_status_lock); RETURN_IF_ERROR(_exec_status); if (reason == PPlanFragmentCancelReason::LIMIT_REACH) { _executor.set_is_report_on_cancel(false); } - _executor.cancel(); + _executor.cancel(reason, msg); if (_pipe != nullptr) { _pipe->cancel(PPlanFragmentCancelReason_Name(reason)); } @@ -462,7 +470,7 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi .query_id(exec_state->query_id()) .instance_id(exec_state->fragment_instance_id()) .tag("pthread_id", std::to_string((uintptr_t)pthread_self())); - current_thread.attach_query(exec_state->query_id()); + thread_local_ctx.attach_query(exec_state->query_id(), exec_state->fragment_instance_id()); exec_state->execute(); std::shared_ptr fragments_ctx = exec_state->get_fragments_ctx(); @@ -483,7 +491,7 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi // Callback after remove from this id cb(exec_state->executor()); - current_thread.update_mem_tracker(nullptr); + thread_local_ctx.unattach_query(); } Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params) { @@ 
-646,7 +654,8 @@ Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params, Fi return Status::OK(); } -Status FragmentMgr::cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason) { +Status FragmentMgr::cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason, + const std::string& msg) { std::shared_ptr exec_state; { std::lock_guard lock(_lock); @@ -656,12 +665,30 @@ Status FragmentMgr::cancel(const TUniqueId& fragment_id, const PPlanFragmentCanc return Status::OK(); } exec_state = iter->second; + exec_state->set_is_canceling(); } - exec_state->cancel(reason); + exec_state->cancel(reason, msg); return Status::OK(); } +Status FragmentMgr::is_canceling(const TUniqueId& fragment_id) { + std::shared_ptr exec_state; + { + std::lock_guard lock(_lock); + auto iter = _fragment_map.find(fragment_id); + if (iter != _fragment_map.end()) { + exec_state = iter->second; + if (exec_state->is_canceling()) { + return Status::Cancelled("Canceling"); + } else { + return Status::OK(); + } + } + } + return Status::InternalError("FragmentID not found"); +} + void FragmentMgr::cancel_worker() { LOG(INFO) << "FragmentMgr cancel worker start working."; do { diff --git a/be/src/runtime/fragment_mgr.h b/be/src/runtime/fragment_mgr.h index ba562164a2bbb9..8be4255ea8d2f2 100644 --- a/be/src/runtime/fragment_mgr.h +++ b/be/src/runtime/fragment_mgr.h @@ -70,7 +70,10 @@ class FragmentMgr : public RestMonitorIface { return cancel(fragment_id, PPlanFragmentCancelReason::INTERNAL_ERROR); } - Status cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason); + Status cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason, + const std::string& msg = ""); + + Status is_canceling(const TUniqueId& fragment_id); void cancel_worker(); diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index ab44695d5d90d3..c960626708be7e 100644 --- a/be/src/runtime/mem_tracker.cpp +++ 
b/be/src/runtime/mem_tracker.cpp @@ -62,7 +62,15 @@ const std::string QUERY_MEM_TRACKER_LABEL_FORMAT = "queryId=$0"; /// Calculate the soft limit for a MemTracker based on the hard limit 'limit'. static int64_t CalcSoftLimit(int64_t limit) { - if (limit < 0) return -1; + if (limit < 0) { + return -1; + } + if (limit > MemInfo::physical_mem()) { + LOG(WARNING) << "Memory limit " << PrettyPrinter::print(limit, TUnit::BYTES) + << " exceeds physical memory of " + << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); + limit = MemInfo::physical_mem(); + } double frac = std::max(0.0, std::min(1.0, config::soft_mem_limit_frac)); return static_cast(limit * frac); } @@ -76,9 +84,20 @@ void MemTracker::CreateRootTracker() { root_tracker->Init(); } +// An independent tracker, no parent and child, +// used in tcmalloc new/delete hook to count the real memory of the process +static std::shared_ptr global_hook_mem_tracker; +static GoogleOnceType global_hook_mem_tracker_once = GOOGLE_ONCE_INIT; + +void MemTracker::CreateGlobalHookTracker() { + global_hook_mem_tracker.reset(new MemTracker(nullptr, -1, "Global Hook", nullptr, true, MemTrackerLevel::OVERVIEW)); + global_hook_mem_tracker->Init(); +} + std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, int64_t byte_limit, const std::string& label, const std::shared_ptr& parent, - bool reset_label_name, MemTrackerLevel level, std::string query_id) { + bool reset_label_name, MemTrackerLevel level, + const std::string& query_id) { std::shared_ptr real_parent; std::string label_name; // if parent is not null, reset label name to query id. 
@@ -112,7 +131,7 @@ std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, i std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const std::string& label, std::shared_ptr parent, bool log_usage_if_zero, bool reset_label_name, - MemTrackerLevel level, std::string query_id) { + MemTrackerLevel level, const std::string& query_id) { std::shared_ptr real_parent; std::string label_name; // if parent is not null, reset label name to query id. @@ -246,41 +265,51 @@ int64_t MemTracker::GetPoolMemReserved() { std::shared_ptr QueryMemTrackerRegistry::RegisterQueryMemTracker( const std::string& query_id, int64_t mem_limit) { DCHECK(!query_id.empty()); - if (mem_limit != -1) { - if (mem_limit > MemInfo::physical_mem()) { - LOG(WARNING) << "Memory limit " << PrettyPrinter::print(mem_limit, TUnit::BYTES) - << " exceeds physical memory of " - << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); - } - VLOG(2) << "Using query memory limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); - } + VLOG(2) << "Register query memory tracker, query id: " << query_id + << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); // First time this query_id registered, make a new object, otherwise do nothing. 
+ // Combine CreateTracker and emplace into one operation to avoid the use of locks _query_mem_trackers.try_emplace_l( query_id, [](std::shared_ptr) {}, MemTracker::CreateTracker(mem_limit, strings::Substitute(QUERY_MEM_TRACKER_LABEL_FORMAT, query_id), - ExecEnv::GetInstance()->hook_process_mem_tracker(), false, + ExecEnv::GetInstance()->all_query_mem_tracker(), false, false, MemTrackerLevel::OVERVIEW, query_id)); + std::shared_ptr tracker = GetQueryMemTracker(query_id); + // tracker->set_exist_transfer_control(); + return tracker; +} +std::shared_ptr QueryMemTrackerRegistry::GetQueryMemTracker(const std::string& query_id) { std::shared_ptr tracker = nullptr; - _query_mem_trackers.if_contains(query_id, - [&tracker](std::shared_ptr v) { tracker = v; }); + QueryTrackersMap::iterator it = _query_mem_trackers.find(query_id); + if (it != _query_mem_trackers.end()) { + DCHECK(query_id == it->second->query_id()); + tracker = it->second; + } return tracker; } -void QueryMemTrackerRegistry::DeregisterQueryMemTracker(const std::string& query_id) { - DCHECK(!query_id.empty()); - _query_mem_trackers.erase_if(query_id, [](std::shared_ptr) { return true; }); - LOG(WARNING) << "DeregisterQueryMemTracker " << query_id << " len " << _query_mem_trackers.size(); +void QueryMemTrackerRegistry::DeregisterQueryMemTracker() { + for (auto it = _query_mem_trackers.begin(); it != _query_mem_trackers.end();) { + // No RuntimeState uses this query MemTracker, it is only referenced by this map, delete it + if (it->second.use_count() == 1) { + VLOG(2) << "Deregister query memory tracker, query id: " << it->first; + _query_mem_trackers.erase(it->first); + } + it++; + } } MemTracker::~MemTracker() { delete reservation_counters_.load(); if (parent()) { - DCHECK(consumption() == 0) << "Memory tracker " << debug_string() - << " has unreleased consumption " << consumption(); + if (!exist_transfer_control()) { + DCHECK(consumption() == 0) << "Memory tracker " << debug_string() + << " has unreleased 
consumption " << consumption(); + } parent_->Release(consumption()); lock_guard l(parent_->child_trackers_lock_); @@ -295,6 +324,7 @@ void MemTracker::ListTrackers(vector>* trackers) { trackers->clear(); deque> to_process; to_process.push_front(GetRootTracker()); + to_process.push_front(GetGlobalHookTracker()); while (!to_process.empty()) { shared_ptr t = to_process.back(); to_process.pop_back(); @@ -484,7 +514,7 @@ void MemTracker::GetTopNQueries( MemTracker* MemTracker::GetQueryMemTracker() { MemTracker* tracker = this; - while (tracker != nullptr && !tracker->is_query_mem_tracker_) { + while (tracker != nullptr && !tracker->_is_query_mem_tracker) { tracker = tracker->parent_.get(); } return tracker; @@ -582,4 +612,9 @@ std::shared_ptr MemTracker::GetRootTracker() { return root_tracker; } +std::shared_ptr MemTracker::GetGlobalHookTracker() { + GoogleOnceInit(&global_hook_mem_tracker_once, &MemTracker::CreateGlobalHookTracker); + return global_hook_mem_tracker; +} + } // namespace doris diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index 2dfe9e4ae6ebd3..5914344f260a18 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -95,13 +95,13 @@ class MemTracker : public std::enable_shared_from_this { int64_t byte_limit = -1, const std::string& label = std::string(), std::shared_ptr parent = std::shared_ptr(), bool log_usage_if_zero = true, bool reset_label_name = true, - MemTrackerLevel level = MemTrackerLevel::VERBOSE, std::string query_id = std::string()); + MemTrackerLevel level = MemTrackerLevel::VERBOSE, const std::string& query_id = std::string()); static std::shared_ptr CreateTracker( RuntimeProfile* profile, int64_t byte_limit, const std::string& label = std::string(), const std::shared_ptr& parent = std::shared_ptr(), bool reset_label_name = true, MemTrackerLevel level = MemTrackerLevel::VERBOSE, - std::string query_id = std::string()); + const std::string& query_id = std::string()); // this is used for 
creating an orphan mem tracker, or for unit test. // If a mem tracker has parent, it should be created by `CreateTracker()` @@ -119,6 +119,9 @@ class MemTracker : public std::enable_shared_from_this { // Gets a shared_ptr to the "root" tracker, creating it if necessary. static std::shared_ptr GetRootTracker(); + // Gets a shared_ptr to the "global_hook" tracker, creating it if necessary. + static std::shared_ptr GetGlobalHookTracker(); + /// Increases consumption of this tracker and its ancestors by 'bytes'. void Consume(int64_t bytes) { // DCHECK_GE(bytes, 0); @@ -137,7 +140,7 @@ class MemTracker : public std::enable_shared_from_this { for (auto& tracker : all_trackers_) { tracker->consumption_->add(bytes); if (LIKELY(tracker->consumption_metric_ == nullptr)) { - DCHECK_GE(tracker->consumption_->current_value(), 0); + DCHECK_GE(tracker->consumption_->current_value(), -config::untracked_mem_limit * 10); } } } @@ -245,7 +248,17 @@ class MemTracker : public std::enable_shared_from_this { /// trackers since we can enforce that the reported memory usage is internally /// consistent.) if (LIKELY(tracker->consumption_metric_ == nullptr)) { - DCHECK_GE(tracker->consumption_->current_value(), 0) + // A small range of negative values is allowed, because TCMalloc Hook consume/release + // MemTracker may cause tracker->consumption to be temporarily less than 0. + // + // Note that, this may obscure other errors. + // consumption_ < 0 will make the memory statistics inaccurate, so it should be avoided. + // 1. The released memory is not consumed. + // 2. The same block of memory, tracker A calls consume, and tracker B calls release. + // 3. Repeated releases of MemTacker. When the consume is called on the child MemTracker, + // after the release is called on the parent MemTracker, + // the child ~MemTracker will cause repeated releases. 
+ DCHECK_GE(tracker->consumption_->current_value(), -config::untracked_mem_limit * 10) << std::endl << tracker->LogUsage(UNLIMITED_DEPTH); } @@ -317,10 +330,18 @@ class MemTracker : public std::enable_shared_from_this { void set_query_id(const std::string& query_id) { if (query_id != std::string()) { query_id_ = query_id; - is_query_mem_tracker_ = true; + _is_query_mem_tracker = true; } } + bool exist_transfer_control() { + return _exist_transfer_control; + } + + void set_exist_transfer_control() { + _exist_transfer_control = true; + } + /// Returns the lowest limit for this tracker and its ancestors. Returns /// -1 if there is no limit. int64_t GetLowestLimit(MemLimit mode) const; @@ -506,11 +527,14 @@ class MemTracker : public std::enable_shared_from_this { // Creates the root tracker. static void CreateRootTracker(); + // Creates the global hook tracker. + static void CreateGlobalHookTracker(); + /// Lock to protect GcMemory(). This prevents many GCs from occurring at once. std::mutex gc_lock_; /// True if this is a Query MemTracker returned from RegisterQueryMemTracker(). - bool is_query_mem_tracker_ = false; + bool _is_query_mem_tracker = false; /// Only valid for MemTrackers returned from RegisterQueryMemTracker() std::string query_id_; @@ -523,6 +547,10 @@ class MemTracker : public std::enable_shared_from_this { /// TryConsume() can opt not to exceed this limit. If -1, there is no consumption limit. const int64_t soft_limit_; + // Whether memory control transfer occurs, between mem trackers. + // The current tracker calls consume/release, and other threads call release/consume. + bool _exist_transfer_control = false; + std::string label_; /// The parent of this tracker. 
The pointer is never modified, even after this tracker @@ -592,7 +620,9 @@ class QueryMemTrackerRegistry { std::shared_ptr RegisterQueryMemTracker(const std::string& query_id, int64_t mem_limit = -1); - void DeregisterQueryMemTracker(const std::string& query_id); + std::shared_ptr GetQueryMemTracker(const std::string& query_id); + + void DeregisterQueryMemTracker(); private: // All per-query MemTracker objects. diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index 98a33ab6bb2bce..6e0e83fcb89e4b 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -28,7 +28,7 @@ #include "exec/scan_node.h" #include "exprs/expr.h" #include "runtime/data_stream_mgr.h" -#include "runtime/thread_status.h" +#include "runtime/thread_context.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/mem_tracker.h" @@ -60,6 +60,8 @@ PlanFragmentExecutor::PlanFragmentExecutor(ExecEnv* exec_env, _closed(false), _is_report_success(true), _is_report_on_cancel(true), + _cancel_reason(PPlanFragmentCancelReason::INTERNAL_ERROR), + _cancel_msg(""), _collect_query_statistics_with_every_batch(false) {} PlanFragmentExecutor::~PlanFragmentExecutor() { @@ -87,7 +89,6 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _runtime_state->set_query_fragments_ctx(fragments_ctx); RETURN_IF_ERROR(_runtime_state->init_mem_trackers(_query_id)); - current_thread.attach_query(_query_id); _runtime_state->set_be_number(request.backend_num); if (request.__isset.backend_id) { _runtime_state->set_backend_id(request.backend_id); @@ -264,6 +265,12 @@ Status PlanFragmentExecutor::open() { _runtime_state->log_error(status.get_error_msg()); } + if (status.is_cancelled()) { + if (_cancel_reason == PPlanFragmentCancelReason::MEMORY_EXCEED_LIMIT) { + status = Status::MemoryLimitExceeded(_cancel_msg); + } + } + update_status(status); return status; } @@ -612,10 +619,12 @@ 
void PlanFragmentExecutor::update_status(const Status& new_status) { send_report(true); } -void PlanFragmentExecutor::cancel() { +void PlanFragmentExecutor::cancel(const PPlanFragmentCancelReason& reason, const std::string& msg) { TAG(LOG(INFO)).log("PlanFragmentExecutor::cancel") .query_id(_query_id).instance_id(_runtime_state->fragment_instance_id()); DCHECK(_prepared); + _cancel_reason = reason; + _cancel_msg = msg; _runtime_state->set_is_cancelled(true); // must close stream_mgr to avoid dead lock in Exchange Node diff --git a/be/src/runtime/plan_fragment_executor.h b/be/src/runtime/plan_fragment_executor.h index 3cdb6bb2495f50..e40bbcb023ba87 100644 --- a/be/src/runtime/plan_fragment_executor.h +++ b/be/src/runtime/plan_fragment_executor.h @@ -24,6 +24,7 @@ #include "common/object_pool.h" #include "common/status.h" +#include "gen_cpp/internal_service.pb.h" #include "runtime/datetime_value.h" #include "runtime/query_fragments_ctx.h" #include "runtime/query_statistics.h" @@ -127,7 +128,8 @@ class PlanFragmentExecutor { void set_abort(); // Initiate cancellation. Must not be called until after prepare() returned. - void cancel(); + void cancel(const PPlanFragmentCancelReason& reason = PPlanFragmentCancelReason::INTERNAL_ERROR, + const std::string& msg = ""); // call these only after prepare() RuntimeState* runtime_state() { return _runtime_state.get(); } @@ -177,6 +179,10 @@ class PlanFragmentExecutor { // This executor will not report status to FE on being cancelled. bool _is_report_on_cancel; + // Record the cancel information when calling the cancel() method, return it to FE + PPlanFragmentCancelReason _cancel_reason; + std::string _cancel_msg; + // Overall execution status. Either ok() or set to the first error status that // was encountered. 
Status _status; diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 89d9d63bce58c6..192b3837b60d93 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -222,7 +222,7 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { MemTracker::CreateTracker(bytes_limit, "RuntimeState:query:" + print_id(query_id), _exec_env->process_mem_tracker(), true, false); _hook_query_mem_tracker = - _exec_env->query_mem_trackers()->RegisterQueryMemTracker(doris::print_id(query_id), bytes_limit); + _exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker(print_id(query_id), bytes_limit); _instance_mem_tracker = MemTracker::CreateTracker(&_profile, -1, "RuntimeState:instance:", _query_mem_tracker); diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 030cfb952756a3..62483aa0b84755 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -406,6 +406,7 @@ class RuntimeState { // MemTracker that is shared by all fragment instances running on this host. // The query mem tracker must be released after the _instance_mem_tracker. std::shared_ptr _query_mem_tracker; + // TODO(zxy): Will replace _query_mem_tracker in future. 
std::shared_ptr _hook_query_mem_tracker; // Memory usage of this fragment instance diff --git a/be/src/runtime/tcmalloc_hook.h b/be/src/runtime/tcmalloc_hook.h index 6c9b3dce1662f1..38b0a99f3aec20 100644 --- a/be/src/runtime/tcmalloc_hook.h +++ b/be/src/runtime/tcmalloc_hook.h @@ -19,19 +19,14 @@ #include #include -#include "runtime/thread_status.h" - -static int new_hook_calls = 0; -static int delete_hook_calls = 0; +#include "runtime/thread_context.h" void new_hook(const void* ptr, size_t size) { - new_hook_calls++; - doris::current_thread.consume_mem(tc_nallocx(size, 0)); + doris::thread_local_ctx.consume_mem(tc_nallocx(size, 0)); } void delete_hook(const void* ptr) { - delete_hook_calls++; - doris::current_thread.release_mem(tc_malloc_size(const_cast(ptr))); + doris::thread_local_ctx.release_mem(tc_malloc_size(const_cast(ptr))); } void init_hook() { diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 4df053e0fb453e..cec0b1a551c692 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -19,8 +19,11 @@ #include +#include "gen_cpp/Types_types.h" #include "runtime/exec_env.h" +#include "runtime/fragment_mgr.h" #include "runtime/mem_tracker.h" +#include "service/backend_options.h" namespace doris { @@ -31,68 +34,135 @@ class TheadContext { _global_hook_tracker(MemTracker::GetGlobalHookTracker()) {} ~TheadContext() { update_query_mem_tracker(); } - void attach_query(const doris::TUniqueId& query_id) { - _query_id = doris::print_id(query_id); - update_query_mem_tracker(ExecEnv::GetInstance()->query_mem_trackers()->RegisterQueryMemTracker( - doris::print_id(query_id))); + void attach_query(const TUniqueId& query_id, + const TUniqueId& fragment_instance_id = TUniqueId()) { + _query_id = query_id; + _fragment_instance_id = fragment_instance_id; + update_query_mem_tracker(ExecEnv::GetInstance()->query_mem_tracker_registry()->GetQueryMemTracker( + print_id(query_id))); } void unattach_query() { - _query_id = 
""; + _query_id = TUniqueId(); + _fragment_instance_id = TUniqueId(); update_query_mem_tracker(); } void update_query_mem_tracker( std::weak_ptr mem_tracker = std::weak_ptr()) { - if (_untracked_mem != 0 && !_query_mem_tracker.expired()) { - if (!_query_mem_tracker.lock()->TryConsume(_untracked_mem)) { - return; // add call back - } + if (_untracked_mem != 0) { + consume(); _untracked_mem = 0; } _query_mem_tracker = mem_tracker; } - void consume(int64_t size) { - _untracked_mem += size; - if (_untracked_mem >= _untracked_mem_limit || _untracked_mem <= -_untracked_mem_limit) { - // TODO(zxy): _untracked_mem <0 means that there is the same block of memory, - // tracker A calls consume, and tracker B calls release. This will make the memory - // statistics inaccurate and should be avoided as much as possible. - // This DCHECK should be turned on in the future. - // DCHECK(_untracked_mem >= 0); - - // There is no default tracker to avoid repeated releases of MemTacker. - // When the consume is called on the child MemTracker, - // after the release is called on the parent MemTracker, - // the child ~MemTracker will cause repeated releases. - if (!_query_mem_tracker.expired()) { - if (!_query_mem_tracker.lock()->TryConsume(_untracked_mem)) { - return; // add call back + void query_mem_limit_exceeded(int64_t mem_usage) { + if (_query_id != TUniqueId() && _fragment_instance_id != TUniqueId() && + ExecEnv::GetInstance()->is_init() && + ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { + std::string detail = "Query Memory exceed limit in TCMalloc Hook New."; + auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); + detail += ", Backend: " + BackendOptions::get_localhost() + + ", Fragment: " + print_id(_fragment_instance_id) + + ", Used: " + std::to_string(_query_mem_tracker.lock()->consumption()) + + ", Limit: " + std::to_string(_query_mem_tracker.lock()->limit()) + + ". 
You can change the limit by session variable exec_mem_limit."; + ExecEnv::GetInstance()->fragment_mgr()->cancel( + _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_EXCEED_LIMIT, detail); + _fragment_instance_id = TUniqueId(); // Make sure it will only be canceled once + } + } + + void global_mem_limit_exceeded(int64_t mem_usage) { + std::string detail = "Global Memory exceed limit in TCMalloc Hook New."; + auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); + } + + // Note that, If call the memory allocation operation in TCMalloc new/delete Hook, + // such as calling LOG/iostream/sstream/stringstream/etc. related methods, + // must increase the control to avoid entering infinite recursion, otherwise it may cause crash or stuck, + void consume() { + // Query_mem_tracker and global_hook_tracker are counted separately, + // in order to ensure that the process memory counted by global_hook_tracker is accurate enough. + // + // Otherwise, if query_mem_tracker is the child of global_hook_tracker and global_hook_tracker + // is the default tracker, it may be the same block of memory. Consume is called in query_mem_tracker, + // and release is called in global_hook_tracker, which is repeatedly released after ~query_mem_tracker. 
+ if (!_query_mem_tracker.expired()) { + if (_query_mem_limit_exceeded == false) { + if (!_query_mem_tracker.lock()->TryConsume(_missed_query_tracker_mem + + _untracked_mem)) { + _query_mem_limit_exceeded = true; + query_mem_limit_exceeded(_missed_query_tracker_mem + _untracked_mem); + _query_mem_limit_exceeded = false; + _missed_query_tracker_mem += _untracked_mem; + } else { + _missed_query_tracker_mem = 0; } + } else { + _missed_query_tracker_mem += _untracked_mem; } - if (!_global_hook_tracker->TryConsume(_untracked_mem)) { - return; // add call back + } + + // The first time GetGlobalHookTracker is called after the main thread starts, == nullptr + if (_global_hook_tracker != nullptr) { + if (_global_mem_limit_exceeded == false) { + if (!_global_hook_tracker->TryConsume(_missed_global_tracker_mem + _untracked_mem)) { + _global_mem_limit_exceeded = true; + global_mem_limit_exceeded(_missed_global_tracker_mem + _untracked_mem); + _global_mem_limit_exceeded = false; + _missed_global_tracker_mem += _untracked_mem; + } else { + _missed_global_tracker_mem = 0; + } + } else { + _missed_global_tracker_mem += _untracked_mem; } + } else { + _missed_global_tracker_mem += _untracked_mem; + } + } + + void try_consume(int64_t size) { + _untracked_mem += size; + // When some threads `0 <_untracked_mem <_untracked_mem_limit` + // and some threads `_untracked_mem <= -_untracked_mem_limit` trigger consumption(), + // it will cause tracker->consumption to be temporarily less than 0. 
+ if (_untracked_mem >= _untracked_mem_limit || _untracked_mem <= -_untracked_mem_limit) { + consume(); _untracked_mem = 0; } } - void consume_mem(int64_t size) { consume(size); } + void consume_mem(int64_t size) { try_consume(size); } - void release_mem(int64_t size) { consume(-size); } + void release_mem(int64_t size) { try_consume(-size); } - const std::string& query_id() { return _query_id; } + const TUniqueId& query_id() { return _query_id; } const std::thread::id& thread_id() { return _thread_id; } private: std::thread::id _thread_id; - std::string _query_id; + TUniqueId _query_id; + TUniqueId _fragment_instance_id; std::weak_ptr _query_mem_tracker; std::shared_ptr _global_hook_tracker = nullptr; + + // The memory size that is not tracker is used to control batch trackers, + // avoid frequent consume/release. int64_t _untracked_mem = 0; - int64_t _untracked_mem_limit = 1 * 1024 * 1024; -}; + int64_t _untracked_mem_limit = config::untracked_mem_limit; + + // Memory size of tracker failure after mem limit exceeded, + // expect to be successfully consumed later. + int64_t _missed_query_tracker_mem = 0; + int64_t _missed_global_tracker_mem = 0; + + // After mem limit exceeded, avoid entering infinite recursion. 
+ bool _query_mem_limit_exceeded = false; + bool _global_mem_limit_exceeded = false; +}; // namespace doris inline thread_local TheadContext thread_local_ctx; } // namespace doris diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 21f8779e48aa1e..0de13cceb9c342 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -286,6 +286,7 @@ int main(int argc, char** argv) { #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) doris::MemInfo::refresh_current_mem(); #endif + doris::ExecEnv::GetInstance()->query_mem_tracker_registry()->DeregisterQueryMemTracker(); sleep(10); } diff --git a/be/test/runtime/fragment_mgr_test.cpp b/be/test/runtime/fragment_mgr_test.cpp index ffe11b44eb2c82..304d6d8f77db8c 100644 --- a/be/test/runtime/fragment_mgr_test.cpp +++ b/be/test/runtime/fragment_mgr_test.cpp @@ -47,7 +47,8 @@ Status PlanFragmentExecutor::open() { return s_open_status; } -void PlanFragmentExecutor::cancel() {} +void PlanFragmentExecutor::cancel(const PPlanFragmentCancelReason& reason, const std::string& msg) { +} void PlanFragmentExecutor::set_abort() { LOG(INFO) << "Plan Aborted"; diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc index eafaaed4badc80..92b695456b6278 100644 --- a/be/test/runtime/test_env.cc +++ b/be/test/runtime/test_env.cc @@ -37,7 +37,7 @@ TestEnv::TestEnv() _exec_env = ExecEnv::GetInstance(); _exec_env->_thread_mgr = new ThreadResourceMgr(2); _exec_env->_buffer_reservation = new ReservationTracker(); - _exec_env->_mem_tracker = MemTracker::CreateTracker(-1, "TestEnv"); + _exec_env->_process_mem_tracker = MemTracker::CreateTracker(-1, "TestEnv"); _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10); _exec_env->disk_io_mgr()->init(_io_mgr_tracker); _exec_env->_scan_thread_pool = new PriorityThreadPool(1, 16); diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 
98d0c9b0991103..4b3b137af8e93e 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1441,6 +1441,12 @@ The size of the buffer before flashing ``` * Default: 0 +### `untracked_mem_limit` + +* Type: int32 +* Description: The maximum buffer length allowed when TCMalloc Hook consumes/releases MemTracker, that is, the minimum batch of consume/release. Increasing this value will increase the frequency of consume/release. +* Default: 4M + ### `max_segment_num_per_rowset` * Type: int32 diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index 83ce1563586950..57dc49ff11cdd2 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1460,6 +1460,12 @@ webserver默认工作线程数 ``` * 默认值: 0 +### `untracked_mem_limit` + +* 类型: int32 +* 描述: TCMalloc Hook consume/release MemTracker时允许的最大缓存长度,即consume/release的最小批次。增大该值会增加consume/release的频率。 +* 默认值: 4M + ### `max_segment_num_per_rowset` * 类型: int32 diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java index dd3fcdce1703c8..d01d6f99b362c2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java @@ -534,7 +534,8 @@ private void sendFragment() throws TException, RpcException, UserException { if (colocateFragmentIds.contains(fragment.getFragmentId().asInt())) { int rate = Math.min(Config.query_colocate_join_memory_limit_penalty_factor, instanceNum); long newmemory = memoryLimit / rate; - + // TODO(zxy): The meaning of mem limit in query_options has become the real once query mem limit. + // The logic to modify mem_limit here needs to be modified or deleted. 
for (TExecPlanFragmentParams tParam : tParams) { tParam.query_options.setMemLimit(newmemory); } diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto index 41a0dce4bb13ba..5afb8679c281b8 100644 --- a/gensrc/proto/internal_service.proto +++ b/gensrc/proto/internal_service.proto @@ -143,6 +143,7 @@ enum PPlanFragmentCancelReason { USER_CANCEL = 2; INTERNAL_ERROR = 3; TIMEOUT = 4; + MEMORY_EXCEED_LIMIT = 5; }; message PCancelPlanFragmentRequest { From f4da9927413b6aad6b0dd86d0fd7b190f65137f7 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Sun, 28 Nov 2021 17:46:03 +0800 Subject: [PATCH 05/14] Resolve conflicts, fix ut, fix bugs --- be/src/runtime/mem_tracker.cpp | 27 +++++++++------- be/src/runtime/mem_tracker.h | 5 +-- be/src/runtime/runtime_state.cpp | 8 ++++- be/src/runtime/tcmalloc_hook.h | 6 ++++ be/src/runtime/thread_context.h | 34 +++++++++++++-------- be/src/util/mem_info.h | 4 +++ be/test/exec/tablet_sink_test.cpp | 2 ++ be/test/runtime/test_env.cc | 2 ++ be/test/util/arrow/arrow_work_flow_test.cpp | 1 + 9 files changed, 62 insertions(+), 27 deletions(-) diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index c960626708be7e..d8c91c0870fc2e 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -65,7 +65,7 @@ static int64_t CalcSoftLimit(int64_t limit) { if (limit < 0) { return -1; } - if (limit > MemInfo::physical_mem()) { + if (MemInfo::initialized() && limit > MemInfo::physical_mem()) { LOG(WARNING) << "Memory limit " << PrettyPrinter::print(limit, TUnit::BYTES) << " exceeds physical memory of " << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); @@ -277,28 +277,33 @@ std::shared_ptr QueryMemTrackerRegistry::RegisterQueryMemTracker( ExecEnv::GetInstance()->all_query_mem_tracker(), false, false, MemTrackerLevel::OVERVIEW, query_id)); std::shared_ptr tracker = GetQueryMemTracker(query_id); - // tracker->set_exist_transfer_control(); + if (tracker != nullptr) { 
+ tracker->set_exist_transfer_control(); + } return tracker; } std::shared_ptr QueryMemTrackerRegistry::GetQueryMemTracker(const std::string& query_id) { + DCHECK(!query_id.empty()); std::shared_ptr tracker = nullptr; - QueryTrackersMap::iterator it = _query_mem_trackers.find(query_id); - if (it != _query_mem_trackers.end()) { - DCHECK(query_id == it->second->query_id()); - tracker = it->second; - } + // Avoid using locks to resolve erase conflicts + _query_mem_trackers.if_contains(query_id, + [&tracker](std::shared_ptr v) { tracker = v; }); return tracker; } void QueryMemTrackerRegistry::DeregisterQueryMemTracker() { - for (auto it = _query_mem_trackers.begin(); it != _query_mem_trackers.end();) { + std::vector expired_querys; + for (auto it = _query_mem_trackers.begin(); it != _query_mem_trackers.end(); it++) { // No RuntimeState uses this query MemTracker, it is only referenced by this map, delete it if (it->second.use_count() == 1) { - VLOG(2) << "Deregister query memory tracker, query id: " << it->first; - _query_mem_trackers.erase(it->first); + expired_querys.emplace_back(it->first); } - it++; + } + for (auto qid: expired_querys) { + DCHECK(_query_mem_trackers[qid].use_count() == 1); + _query_mem_trackers.erase(qid); + VLOG(2) << "Deregister query memory tracker, query id: " << qid; } } diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index 5914344f260a18..d9cf619e08209c 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -176,7 +176,8 @@ class MemTracker : public std::enable_shared_from_this { Release(-bytes); return Status::OK(); } - if (MemInfo::current_mem() + bytes >= MemInfo::mem_limit()) { + // TCMalloc new/delete hook will call consume before MemInfo is initialized. 
+ if (MemInfo::initialized() && MemInfo::current_mem() + bytes >= MemInfo::mem_limit()) { return Status::MemoryLimitExceeded(fmt::format( "{}: TryConsume failed, bytes={} process whole consumption={} mem limit={}", label_, bytes, MemInfo::current_mem(), MemInfo::mem_limit())); @@ -547,7 +548,7 @@ class MemTracker : public std::enable_shared_from_this { /// TryConsume() can opt not to exceed this limit. If -1, there is no consumption limit. const int64_t soft_limit_; - // Whether memory control transfer occurs, between mem trackers. + // Whether memory control transfer occurs, between mem trackers. Happened at: // The current tracker calls consume/release, and other threads call release/consume. bool _exist_transfer_control = false; diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 192b3837b60d93..5f725bd14e677a 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -221,9 +221,15 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { _query_mem_tracker = MemTracker::CreateTracker(bytes_limit, "RuntimeState:query:" + print_id(query_id), _exec_env->process_mem_tracker(), true, false); +#ifdef BE_TEST + if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { + _hook_query_mem_tracker = + _exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker(print_id(query_id), bytes_limit); + } +#else _hook_query_mem_tracker = _exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker(print_id(query_id), bytes_limit); - +#endif _instance_mem_tracker = MemTracker::CreateTracker(&_profile, -1, "RuntimeState:instance:", _query_mem_tracker); diff --git a/be/src/runtime/tcmalloc_hook.h b/be/src/runtime/tcmalloc_hook.h index 38b0a99f3aec20..5d8f80df1d45f7 100644 --- a/be/src/runtime/tcmalloc_hook.h +++ b/be/src/runtime/tcmalloc_hook.h @@ -29,6 +29,12 @@ void delete_hook(const void* ptr) { doris::thread_local_ctx.release_mem(tc_malloc_size(const_cast(ptr))); } +// Notice: 
modify the command in New/Delete Hook should be careful enough, +// and should be as simple as possible, otherwise it may cause weird errors. E.g: +// 1. The first New Hook call of the process may be before some variables of +// the process are initialized. +// 2. Allocating memory in the Hook command causes the Hook to be entered again, +// infinite recursion. void init_hook() { MallocHook::AddNewHook(&new_hook); MallocHook::AddDeleteHook(&delete_hook); diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index cec0b1a551c692..1f14b7ddadc8ee 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -38,8 +38,15 @@ class TheadContext { const TUniqueId& fragment_instance_id = TUniqueId()) { _query_id = query_id; _fragment_instance_id = fragment_instance_id; - update_query_mem_tracker(ExecEnv::GetInstance()->query_mem_tracker_registry()->GetQueryMemTracker( - print_id(query_id))); +#ifdef BE_TEST + if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { + return; + } +#endif + update_query_mem_tracker( + ExecEnv::GetInstance()->query_mem_tracker_registry()->GetQueryMemTracker( + print_id(query_id))); + } void unattach_query() { @@ -90,16 +97,16 @@ class TheadContext { // is the default tracker, it may be the same block of memory. Consume is called in query_mem_tracker, // and release is called in global_hook_tracker, which is repeatedly released after ~query_mem_tracker. 
if (!_query_mem_tracker.expired()) { - if (_query_mem_limit_exceeded == false) { + if (_query_mem_consuming == false) { + _query_mem_consuming = true; if (!_query_mem_tracker.lock()->TryConsume(_missed_query_tracker_mem + _untracked_mem)) { - _query_mem_limit_exceeded = true; query_mem_limit_exceeded(_missed_query_tracker_mem + _untracked_mem); - _query_mem_limit_exceeded = false; _missed_query_tracker_mem += _untracked_mem; } else { _missed_query_tracker_mem = 0; } + _query_mem_consuming = false; } else { _missed_query_tracker_mem += _untracked_mem; } @@ -107,15 +114,16 @@ class TheadContext { // The first time GetGlobalHookTracker is called after the main thread starts, == nullptr if (_global_hook_tracker != nullptr) { - if (_global_mem_limit_exceeded == false) { - if (!_global_hook_tracker->TryConsume(_missed_global_tracker_mem + _untracked_mem)) { - _global_mem_limit_exceeded = true; + if (_global_mem_consuming == false) { + _global_mem_consuming = true; + if (!_global_hook_tracker->TryConsume(_missed_global_tracker_mem + + _untracked_mem)) { global_mem_limit_exceeded(_missed_global_tracker_mem + _untracked_mem); - _global_mem_limit_exceeded = false; _missed_global_tracker_mem += _untracked_mem; } else { _missed_global_tracker_mem = 0; } + _global_mem_consuming = false; } else { _missed_global_tracker_mem += _untracked_mem; } @@ -159,10 +167,10 @@ class TheadContext { int64_t _missed_query_tracker_mem = 0; int64_t _missed_global_tracker_mem = 0; - // After mem limit exceeded, avoid entering infinite recursion. - bool _query_mem_limit_exceeded = false; - bool _global_mem_limit_exceeded = false; -}; // namespace doris + // When memory is being consumed, avoid entering infinite recursion. 
+ bool _query_mem_consuming = false; + bool _global_mem_consuming = false; +}; inline thread_local TheadContext thread_local_ctx; } // namespace doris diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h index 6ae8669f868d50..384fcc140f91c7 100644 --- a/be/src/util/mem_info.h +++ b/be/src/util/mem_info.h @@ -34,6 +34,10 @@ class MemInfo { // Initialize MemInfo. static void init(); + static inline bool initialized() { + return _s_initialized; + } + // Get total physical memory in bytes (if has cgroups memory limits, return the limits). static inline int64_t physical_mem() { DCHECK(_s_initialized); diff --git a/be/test/exec/tablet_sink_test.cpp b/be/test/exec/tablet_sink_test.cpp index 3d55699a6f6a8c..9fb6db40cd852b 100644 --- a/be/test/exec/tablet_sink_test.cpp +++ b/be/test/exec/tablet_sink_test.cpp @@ -57,6 +57,7 @@ class OlapTableSinkTest : public testing::Test { _env->_internal_client_cache = new BrpcClientCache(); _env->_function_client_cache = new BrpcClientCache(); _env->_buffer_reservation = new ReservationTracker(); + _env->_query_mem_tracker_registry = new QueryMemTrackerRegistry(); ThreadPoolBuilder("SendBatchThreadPool") .set_min_threads(1) .set_max_threads(5) @@ -73,6 +74,7 @@ class OlapTableSinkTest : public testing::Test { SAFE_DELETE(_env->_master_info); SAFE_DELETE(_env->_thread_mgr); SAFE_DELETE(_env->_buffer_reservation); + SAFE_DELETE(_env->_query_mem_tracker_registry); if (_server) { _server->Stop(100); _server->Join(); diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc index 92b695456b6278..9967142c3249ff 100644 --- a/be/test/runtime/test_env.cc +++ b/be/test/runtime/test_env.cc @@ -38,6 +38,7 @@ TestEnv::TestEnv() _exec_env->_thread_mgr = new ThreadResourceMgr(2); _exec_env->_buffer_reservation = new ReservationTracker(); _exec_env->_process_mem_tracker = MemTracker::CreateTracker(-1, "TestEnv"); + _exec_env->_query_mem_tracker_registry = new QueryMemTrackerRegistry(); _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 
1, 10); _exec_env->disk_io_mgr()->init(_io_mgr_tracker); _exec_env->_scan_thread_pool = new PriorityThreadPool(1, 16); @@ -65,6 +66,7 @@ TestEnv::~TestEnv() { SAFE_DELETE(_exec_env->_scan_thread_pool); SAFE_DELETE(_exec_env->_disk_io_mgr); SAFE_DELETE(_exec_env->_buffer_reservation); + SAFE_DELETE(_exec_env->_query_mem_tracker_registry); SAFE_DELETE(_exec_env->_thread_mgr); if (_engine == StorageEngine::_s_instance) { diff --git a/be/test/util/arrow/arrow_work_flow_test.cpp b/be/test/util/arrow/arrow_work_flow_test.cpp index 658a5ac3a6f698..4f34b1a395c6fb 100644 --- a/be/test/util/arrow/arrow_work_flow_test.cpp +++ b/be/test/util/arrow/arrow_work_flow_test.cpp @@ -91,6 +91,7 @@ void ArrowWorkFlowTest::init_runtime_state() { _exec_env->_result_queue_mgr = new ResultQueueMgr(); _exec_env->_thread_mgr = new ThreadResourceMgr(); _exec_env->_buffer_reservation = new ReservationTracker(); + _exec_env->_query_mem_tracker_registry = new QueryMemTrackerRegistry(); TQueryOptions query_options; query_options.batch_size = 1024; TUniqueId query_id; From be4ed240c01db027562759619b28ccc3109cb5d8 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Sun, 28 Nov 2021 17:54:38 +0800 Subject: [PATCH 06/14] format --- be/src/runtime/mem_tracker.cpp | 53 ++++++++++++++++++-------------- be/src/runtime/mem_tracker.h | 48 +++++++++++++++-------------- be/src/runtime/runtime_state.cpp | 10 +++--- be/src/runtime/thread_context.h | 1 - be/src/service/doris_main.cpp | 4 +-- be/src/util/mem_info.h | 4 +-- 6 files changed, 63 insertions(+), 57 deletions(-) diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index d8c91c0870fc2e..4e3fc9c592da7b 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -17,11 +17,9 @@ #include "runtime/mem_tracker.h" -#include - -#include #include - +#include +#include #include #include @@ -80,7 +78,8 @@ static std::shared_ptr root_tracker; static GoogleOnceType root_tracker_once = GOOGLE_ONCE_INIT; void 
MemTracker::CreateRootTracker() { - root_tracker.reset(new MemTracker(nullptr, -1, "Root", nullptr, true, MemTrackerLevel::OVERVIEW)); + root_tracker.reset( + new MemTracker(nullptr, -1, "Root", nullptr, true, MemTrackerLevel::OVERVIEW)); root_tracker->Init(); } @@ -90,12 +89,14 @@ static std::shared_ptr global_hook_mem_tracker; static GoogleOnceType global_hook_mem_tracker_once = GOOGLE_ONCE_INIT; void MemTracker::CreateGlobalHookTracker() { - global_hook_mem_tracker.reset(new MemTracker(nullptr, -1, "Global Hook", nullptr, true, MemTrackerLevel::OVERVIEW)); + global_hook_mem_tracker.reset( + new MemTracker(nullptr, -1, "Global Hook", nullptr, true, MemTrackerLevel::OVERVIEW)); global_hook_mem_tracker->Init(); } std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, int64_t byte_limit, - const std::string& label, const std::shared_ptr& parent, + const std::string& label, + const std::shared_ptr& parent, bool reset_label_name, MemTrackerLevel level, const std::string& query_id) { std::shared_ptr real_parent; @@ -120,8 +121,9 @@ std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, i label_name = label; } - shared_ptr tracker(new MemTracker(profile, byte_limit, label_name, real_parent, true, - level > real_parent->_level ? level : real_parent->_level)); + shared_ptr tracker( + new MemTracker(profile, byte_limit, label_name, real_parent, true, + level > real_parent->_level ? 
level : real_parent->_level)); real_parent->AddChildTracker(tracker); tracker->Init(); tracker->set_query_id(query_id); @@ -130,8 +132,10 @@ std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, i } std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const std::string& label, - std::shared_ptr parent, bool log_usage_if_zero, bool reset_label_name, - MemTrackerLevel level, const std::string& query_id) { + std::shared_ptr parent, + bool log_usage_if_zero, bool reset_label_name, + MemTrackerLevel level, + const std::string& query_id) { std::shared_ptr real_parent; std::string label_name; // if parent is not null, reset label name to query id. @@ -156,7 +160,7 @@ std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const shared_ptr tracker( new MemTracker(nullptr, byte_limit, label_name, real_parent, log_usage_if_zero, - level > real_parent->_level ? level : real_parent->_level)); + level > real_parent->_level ? level : real_parent->_level)); real_parent->AddChildTracker(tracker); tracker->Init(); tracker->set_query_id(query_id); @@ -165,10 +169,12 @@ std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const } MemTracker::MemTracker(int64_t byte_limit, const std::string& label) - : MemTracker(nullptr, byte_limit, label, std::shared_ptr(), true, MemTrackerLevel::VERBOSE) {} + : MemTracker(nullptr, byte_limit, label, std::shared_ptr(), true, + MemTrackerLevel::VERBOSE) {} MemTracker::MemTracker(RuntimeProfile* profile, int64_t byte_limit, const string& label, - const std::shared_ptr& parent, bool log_usage_if_zero, MemTrackerLevel level) + const std::shared_ptr& parent, bool log_usage_if_zero, + MemTrackerLevel level) : limit_(byte_limit), soft_limit_(CalcSoftLimit(byte_limit)), label_(label), @@ -253,8 +259,7 @@ int64_t MemTracker::GetPoolMemReserved() { // Make sure we don't overflow if the query limits are set to ridiculous values. 
mem_reserved += std::min(child_limit, MemInfo::physical_mem()); } else { - DCHECK(child_limit == -1) - << child->LogUsage(UNLIMITED_DEPTH); + DCHECK(child_limit == -1) << child->LogUsage(UNLIMITED_DEPTH); mem_reserved += child->consumption(); } } @@ -274,8 +279,8 @@ std::shared_ptr QueryMemTrackerRegistry::RegisterQueryMemTracker( query_id, [](std::shared_ptr) {}, MemTracker::CreateTracker(mem_limit, strings::Substitute(QUERY_MEM_TRACKER_LABEL_FORMAT, query_id), - ExecEnv::GetInstance()->all_query_mem_tracker(), false, - false, MemTrackerLevel::OVERVIEW, query_id)); + ExecEnv::GetInstance()->all_query_mem_tracker(), false, false, + MemTrackerLevel::OVERVIEW, query_id)); std::shared_ptr tracker = GetQueryMemTracker(query_id); if (tracker != nullptr) { tracker->set_exist_transfer_control(); @@ -283,7 +288,8 @@ std::shared_ptr QueryMemTrackerRegistry::RegisterQueryMemTracker( return tracker; } -std::shared_ptr QueryMemTrackerRegistry::GetQueryMemTracker(const std::string& query_id) { +std::shared_ptr QueryMemTrackerRegistry::GetQueryMemTracker( + const std::string& query_id) { DCHECK(!query_id.empty()); std::shared_ptr tracker = nullptr; // Avoid using locks to resolve erase conflicts @@ -297,10 +303,10 @@ void QueryMemTrackerRegistry::DeregisterQueryMemTracker() { for (auto it = _query_mem_trackers.begin(); it != _query_mem_trackers.end(); it++) { // No RuntimeState uses this query MemTracker, it is only referenced by this map, delete it if (it->second.use_count() == 1) { - expired_querys.emplace_back(it->first); + expired_querys.emplace_back(it->first); } } - for (auto qid: expired_querys) { + for (auto qid : expired_querys) { DCHECK(_query_mem_trackers[qid].use_count() == 1); _query_mem_trackers.erase(qid); VLOG(2) << "Deregister query memory tracker, query id: " << qid; @@ -342,7 +348,8 @@ void MemTracker::ListTrackers(vector>* trackers) { } for (const auto& child_weak : children) { shared_ptr child = child_weak.lock(); - if (child && static_cast(child->_level) <= 
config::mem_tracker_level) { + if (child && static_cast(child->_level) <= + config::mem_tracker_level) { to_process.emplace_back(std::move(child)); } } @@ -416,7 +423,7 @@ std::string MemTracker::LogUsage(int max_recursive_depth, const string& prefix, if (CheckLimitExceeded(MemLimit::HARD)) ss << " memory limit exceeded."; if (limit_ > 0) ss << " Limit=" << PrettyPrinter::print(limit_, TUnit::BYTES); - // TODO(zxy): ReservationTrackerCounters is not actually used in the current Doris. + // TODO(zxy): ReservationTrackerCounters is not actually used in the current Doris. // Printing here ReservationTrackerCounters may cause BE crash when high concurrency. // The memory tracker in Doris will be redesigned in the future. // ReservationTrackerCounters* reservation_counters = reservation_counters_.load(); diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index d9cf619e08209c..9d75111711ee33 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -17,12 +17,13 @@ #pragma once +#include + #include #include #include #include #include -#include #include #include #include @@ -46,6 +47,20 @@ enum class MemLimit { HARD, SOFT }; /// each MemTracker have a Level equals to parent, only be set explicit enum class MemTrackerLevel { OVERVIEW = 0, TASK, VERBOSE }; +// The smallest negative number allowed for consumption value, Unit byte. +// Usually, a negative values means that the statistics are not accurate, +// but A small range of negative values ​​is allowed, because TCMalloc Hook will cache +// a batch of untracked values ​​when it consumes/releases MemTracker, +// which may cause tracker->consumption to be temporarily less than 0. +// Note that, this may obscure other errors. +// consumption_ < 0 will make the memory statistics inaccurate, so it should be avoided. +// 1. The released memory is not consumed. +// 2. The same block of memory, tracker A calls consume, and tracker B calls release. +// 3. Repeated releases of MemTacker. 
When the consume is called on the child MemTracker, +// after the release is called on the parent MemTracker, +// the child ~MemTracker will cause repeated releases. +static const int MIN_NEGATIVE_CONSUMPTION_VALUE = -4 * 1024 * 1024; + class ObjectPool; class MemTracker; struct ReservationTrackerCounters; @@ -95,7 +110,8 @@ class MemTracker : public std::enable_shared_from_this { int64_t byte_limit = -1, const std::string& label = std::string(), std::shared_ptr parent = std::shared_ptr(), bool log_usage_if_zero = true, bool reset_label_name = true, - MemTrackerLevel level = MemTrackerLevel::VERBOSE, const std::string& query_id = std::string()); + MemTrackerLevel level = MemTrackerLevel::VERBOSE, + const std::string& query_id = std::string()); static std::shared_ptr CreateTracker( RuntimeProfile* profile, int64_t byte_limit, const std::string& label = std::string(), @@ -140,7 +156,8 @@ class MemTracker : public std::enable_shared_from_this { for (auto& tracker : all_trackers_) { tracker->consumption_->add(bytes); if (LIKELY(tracker->consumption_metric_ == nullptr)) { - DCHECK_GE(tracker->consumption_->current_value(), -config::untracked_mem_limit * 10); + DCHECK_GE(tracker->consumption_->current_value(), + std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, -config::untracked_mem_limit)); } } } @@ -249,17 +266,8 @@ class MemTracker : public std::enable_shared_from_this { /// trackers since we can enforce that the reported memory usage is internally /// consistent.) if (LIKELY(tracker->consumption_metric_ == nullptr)) { - // A small range of negative values is allowed, because TCMalloc Hook consume/release - // MemTracker may cause tracker->consumption to be temporarily less than 0. - // - // Note that, this may obscure other errors. - // consumption_ < 0 will make the memory statistics inaccurate, so it should be avoided. - // 1. The released memory is not consumed. - // 2. The same block of memory, tracker A calls consume, and tracker B calls release. - // 3. 
Repeated releases of MemTacker. When the consume is called on the child MemTracker, - // after the release is called on the parent MemTracker, - // the child ~MemTracker will cause repeated releases. - DCHECK_GE(tracker->consumption_->current_value(), -config::untracked_mem_limit * 10) + DCHECK_GE(tracker->consumption_->current_value(), + std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, -config::untracked_mem_limit)) << std::endl << tracker->LogUsage(UNLIMITED_DEPTH); } @@ -325,9 +333,7 @@ class MemTracker : public std::enable_shared_from_this { } const std::string& label() const { return label_; } - std::string query_id() { - return query_id_; - } + std::string query_id() { return query_id_; } void set_query_id(const std::string& query_id) { if (query_id != std::string()) { query_id_ = query_id; @@ -335,13 +341,9 @@ class MemTracker : public std::enable_shared_from_this { } } - bool exist_transfer_control() { - return _exist_transfer_control; - } + bool exist_transfer_control() { return _exist_transfer_control; } - void set_exist_transfer_control() { - _exist_transfer_control = true; - } + void set_exist_transfer_control() { _exist_transfer_control = true; } /// Returns the lowest limit for this tracker and its ancestors. Returns /// -1 if there is no limit. 
diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 5f725bd14e677a..262cc0dc5b4bd1 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -31,13 +31,13 @@ #include "runtime/buffered_block_mgr2.h" #include "runtime/bufferpool/reservation_tracker.h" #include "runtime/bufferpool/reservation_util.h" -#include "runtime/thread_context.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/initial_reservations.h" #include "runtime/load_path_mgr.h" #include "runtime/mem_tracker.h" #include "runtime/runtime_filter_mgr.h" +#include "runtime/thread_context.h" #include "util/cpu_info.h" #include "util/disk_info.h" #include "util/file_utils.h" @@ -223,12 +223,12 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { _exec_env->process_mem_tracker(), true, false); #ifdef BE_TEST if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { - _hook_query_mem_tracker = - _exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker(print_id(query_id), bytes_limit); + _hook_query_mem_tracker = _exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker( + print_id(query_id), bytes_limit); } #else - _hook_query_mem_tracker = - _exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker(print_id(query_id), bytes_limit); + _hook_query_mem_tracker = _exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker( + print_id(query_id), bytes_limit); #endif _instance_mem_tracker = MemTracker::CreateTracker(&_profile, -1, "RuntimeState:instance:", _query_mem_tracker); diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 1f14b7ddadc8ee..6b13601fb8c4b2 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -46,7 +46,6 @@ class TheadContext { update_query_mem_tracker( ExecEnv::GetInstance()->query_mem_tracker_registry()->GetQueryMemTracker( print_id(query_id))); - } void unattach_query() { diff 
--git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 0de13cceb9c342..52e23c21b80f32 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -264,7 +264,8 @@ int main(int argc, char** argv) { status = heartbeat_thrift_server->start(); if (!status.ok()) { - LOG(ERROR) << "Doris BE HeartBeat Service did not start correctly, exiting: " << status.get_error_msg(); + LOG(ERROR) << "Doris BE HeartBeat Service did not start correctly, exiting: " + << status.get_error_msg(); doris::shutdown_logging(); exit(1); } @@ -317,4 +318,3 @@ static void help(const char* progname) { printf(" -v, --version output version information, then exit\n"); printf(" -?, --help show this help, then exit\n"); } - diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h index 384fcc140f91c7..b2c556e60f4c22 100644 --- a/be/src/util/mem_info.h +++ b/be/src/util/mem_info.h @@ -34,9 +34,7 @@ class MemInfo { // Initialize MemInfo. static void init(); - static inline bool initialized() { - return _s_initialized; - } + static inline bool initialized() { return _s_initialized; } // Get total physical memory in bytes (if has cgroups memory limits, return the limits). 
static inline int64_t physical_mem() { From cfc779d70e8647453ba7a61939dfb9d53bfa7572 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Tue, 30 Nov 2021 14:41:57 +0800 Subject: [PATCH 07/14] fix local bug --- be/src/runtime/mem_tracker.cpp | 4 ++++ be/src/runtime/thread_context.h | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index 4e3fc9c592da7b..b20df3e80eaf6e 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -29,6 +29,7 @@ #include "runtime/bufferpool/reservation_tracker_counters.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "service/backend_options.h" #include "util/debug_util.h" #include "util/doris_metrics.h" @@ -314,6 +315,9 @@ void QueryMemTrackerRegistry::DeregisterQueryMemTracker() { } MemTracker::~MemTracker() { + if (label_ == "Global Hook") { + thread_local_ctx.stop_mem_tracker(); + } delete reservation_counters_.load(); if (parent()) { diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 6b13601fb8c4b2..bc2f8c36922f03 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -132,6 +132,9 @@ class TheadContext { } void try_consume(int64_t size) { + if (_stop_mem_tracker == true) { + return; + } _untracked_mem += size; // When some threads `0 <_untracked_mem <_untracked_mem_limit` // and some threads `_untracked_mem <= -_untracked_mem_limit` trigger consumption(), @@ -146,6 +149,10 @@ class TheadContext { void release_mem(int64_t size) { try_consume(-size); } + void stop_mem_tracker() { + _stop_mem_tracker = true; + } + const TUniqueId& query_id() { return _query_id; } const std::thread::id& thread_id() { return _thread_id; } @@ -169,6 +176,11 @@ class TheadContext { // When memory is being consumed, avoid entering infinite recursion. 
bool _query_mem_consuming = false; bool _global_mem_consuming = false; + + // In some cases, we want to turn off memory statistics. + // For example, when ~GlobalHookTracker, TCMalloc delete hook + // release GlobalHookTracker will crash. + bool _stop_mem_tracker = false; }; inline thread_local TheadContext thread_local_ctx; From 11ba01b69a33eda1998e893681fb22321789a2b6 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Mon, 6 Dec 2021 19:19:53 +0800 Subject: [PATCH 08/14] modify ThreadContext, Some details and naming --- be/src/common/config.h | 6 +- be/src/exec/olap_scan_node.cpp | 9 +- be/src/runtime/exec_env.h | 9 +- be/src/runtime/exec_env_init.cpp | 3 +- be/src/runtime/fragment_mgr.cpp | 5 +- be/src/runtime/mem_tracker.cpp | 38 ++-- be/src/runtime/mem_tracker.h | 33 ++-- be/src/runtime/plan_fragment_executor.cpp | 41 ++-- be/src/runtime/runtime_state.cpp | 7 +- be/src/runtime/thread_context.cpp | 128 +++++++++++++ be/src/runtime/thread_context.h | 176 +++++++----------- be/src/service/doris_main.cpp | 2 +- be/test/exec/tablet_sink_test.cpp | 3 +- be/test/runtime/test_env.cc | 3 +- be/test/util/arrow/arrow_work_flow_test.cpp | 2 +- .../administrator-guide/config/be_config.md | 4 +- .../administrator-guide/config/be_config.md | 4 +- gensrc/proto/internal_service.proto | 2 +- 18 files changed, 286 insertions(+), 189 deletions(-) create mode 100644 be/src/runtime/thread_context.cpp diff --git a/be/src/common/config.h b/be/src/common/config.h index 231c7c97d8fa4c..fc0c4f922832c2 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -602,10 +602,10 @@ CONF_mInt32(remote_storage_read_buffer_mb, "16"); // the level equal or lower than mem_tracker_level will show in web page CONF_Int16(mem_tracker_level, "0"); -// The maximum buffer length allowed when TCMalloc Hook consumes/releases MemTracker, -// that is, the minimum batch of consume/release, specified as number of bytes. 
+// The minimum length when TCMalloc Hook consumes/releases MemTracker, consume size +// smaller than this value will continue to accumulate. specified as number of bytes. // Increasing this value will increase the frequency of consume/release. -CONF_mInt32(untracked_mem_limit, "4194304"); +CONF_mInt32(mem_tracker_consume_min_size_mbytes, "1048576"); // The version information of the tablet will be stored in the memory // in an adjacency graph data structure. diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index 48e0e9dac7e1bb..1e788d771be6d0 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -29,12 +29,12 @@ #include "exprs/expr_context.h" #include "exprs/runtime_filter.h" #include "gen_cpp/PlanNodes_types.h" -#include "runtime/thread_context.h" #include "runtime/exec_env.h" #include "runtime/row_batch.h" #include "runtime/runtime_filter_mgr.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "util/priority_thread_pool.hpp" #include "util/runtime_profile.h" @@ -1502,7 +1502,8 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { } void OlapScanNode::scanner_thread(OlapScanner* scanner) { - thread_local_ctx.attach_query(scanner->runtime_state()->query_id(), _runtime_state->fragment_instance_id()); + thread_local_ctx.attach(ThreadContext::QUERY, print_id(scanner->runtime_state()->query_id()), + _runtime_state->fragment_instance_id()); if (UNLIKELY(_transfer_done)) { _scanner_done = true; std::unique_lock l(_scan_batches_lock); @@ -1512,7 +1513,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); LOG(INFO) << "Scan thread cancelled, cause query done, scan thread started to exit"; - thread_local_ctx.unattach_query(); + thread_local_ctx.detach(); return; } int64_t wait_time = scanner->update_wait_worker_timer(); @@ -1662,7 
+1663,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { // and transfer thread _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); - thread_local_ctx.unattach_query(); + thread_local_ctx.detach(); } Status OlapScanNode::add_one_batch(RowBatch* row_batch) { diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index ff50a2f0cc650d..a5a10712fa7d28 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -20,6 +20,7 @@ #include "common/status.h" #include "olap/options.h" +#include "runtime/mem_tracker.h" #include "util/threadpool.h" namespace doris { @@ -96,7 +97,7 @@ class ExecEnv { // declarations for classes in scoped_ptrs. ~ExecEnv(); - const bool is_init() { return _is_init; } + const bool initialized() { return _is_init; } const std::string& token() const; ExternalScanContextMgr* external_scan_context_mgr() { return _external_scan_context_mgr; } DataStreamMgr* stream_mgr() { return _stream_mgr; } @@ -118,7 +119,9 @@ class ExecEnv { std::shared_ptr process_mem_tracker() { return _process_mem_tracker; } std::shared_ptr all_query_mem_tracker() { return _all_query_mem_tracker; } - QueryMemTrackerRegistry* query_mem_tracker_registry() { return _query_mem_tracker_registry; } + QueryMemTrackerRegistry* query_mem_tracker_registry() { + return _query_mem_tracker_registry.get(); + } ThreadResourceMgr* thread_mgr() { return _thread_mgr; } PriorityThreadPool* scan_thread_pool() { return _scan_thread_pool; } ThreadPool* limited_scan_thread_pool() { return _limited_scan_thread_pool.get(); } @@ -187,7 +190,7 @@ class ExecEnv { std::shared_ptr _process_mem_tracker = nullptr; // The ancestor for all querys tracker. std::shared_ptr _all_query_mem_tracker = nullptr; - QueryMemTrackerRegistry* _query_mem_tracker_registry = nullptr; + std::unique_ptr _query_mem_tracker_registry; ThreadResourceMgr* _thread_mgr = nullptr; // The following two thread pools are used in different scenarios. 
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 9a7f18b64c2e05..129dcb1dbdb94f 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -93,7 +93,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _broker_client_cache = new BrokerServiceClientCache(config::max_client_cache_size_per_host); _extdatasource_client_cache = new ExtDataSourceServiceClientCache(config::max_client_cache_size_per_host); - _query_mem_tracker_registry = new QueryMemTrackerRegistry(); + _query_mem_tracker_registry.reset(new QueryMemTrackerRegistry()); _thread_mgr = new ThreadResourceMgr(); _scan_thread_pool = new PriorityThreadPool(config::doris_scanner_thread_pool_thread_num, config::doris_scanner_thread_pool_queue_size); @@ -304,7 +304,6 @@ void ExecEnv::_destroy() { SAFE_DELETE(_etl_thread_pool); SAFE_DELETE(_scan_thread_pool); SAFE_DELETE(_thread_mgr); - SAFE_DELETE(_query_mem_tracker_registry); SAFE_DELETE(_broker_client_cache); SAFE_DELETE(_extdatasource_client_cache); SAFE_DELETE(_frontend_client_cache); diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index c57a691533188d..efc8e59d94ba7d 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -470,7 +470,8 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi .query_id(exec_state->query_id()) .instance_id(exec_state->fragment_instance_id()) .tag("pthread_id", std::to_string((uintptr_t)pthread_self())); - thread_local_ctx.attach_query(exec_state->query_id(), exec_state->fragment_instance_id()); + thread_local_ctx.attach(ThreadContext::QUERY, print_id(exec_state->query_id()), + exec_state->fragment_instance_id()); exec_state->execute(); std::shared_ptr fragments_ctx = exec_state->get_fragments_ctx(); @@ -491,7 +492,7 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi // Callback after remove from this id cb(exec_state->executor()); - thread_local_ctx.unattach_query(); + 
thread_local_ctx.detach(); } Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params) { diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index b20df3e80eaf6e..e5b36b04549a7c 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -17,15 +17,14 @@ #include "runtime/mem_tracker.h" -#include -#include +#include + #include #include #include #include "exec/exec_node.h" #include "gutil/once.h" -#include "gutil/strings/substitute.h" #include "runtime/bufferpool/reservation_tracker_counters.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" @@ -36,9 +35,9 @@ #include "util/mem_info.h" #include "util/pretty_printer.h" #include "util/stack_util.h" +#include "util/string_util.h" #include "util/uid_util.h" -using boost::join; using std::deque; using std::endl; using std::greater; @@ -50,14 +49,13 @@ using std::string; using std::vector; using std::weak_ptr; -using strings::Substitute; namespace doris { const std::string MemTracker::COUNTER_NAME = "PeakMemoryUsage"; // Name for query MemTrackers. '$0' is replaced with the query id. -const std::string QUERY_MEM_TRACKER_LABEL_FORMAT = "queryId=$0"; +const std::string QUERY_MEM_TRACKER_LABEL_FORMAT = "queryId={}"; /// Calculate the soft limit for a MemTracker based on the hard limit 'limit'. 
static int64_t CalcSoftLimit(int64_t limit) { @@ -112,7 +110,7 @@ std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, i real_parent = parent; if (reset_label_name) { std::vector tmp_result; - boost::split(tmp_result, parent->label(), boost::is_any_of(":")); + tmp_result = split(parent->label(), ":"); label_name = label + ":" + tmp_result[tmp_result.size() - 1]; } else { label_name = label; @@ -149,7 +147,7 @@ std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const real_parent = parent; if (reset_label_name) { std::vector tmp_result; - boost::split(tmp_result, parent->label(), boost::is_any_of(":")); + tmp_result = split(parent->label(), ":"); label_name = label + ":" + tmp_result[tmp_result.size() - 1]; } else { label_name = label; @@ -219,7 +217,7 @@ void MemTracker::EnableReservationReporting(const ReservationTrackerCounters& co int64_t MemTracker::GetLowestLimit(MemLimit mode) const { if (limit_trackers_.empty()) return -1; - int64_t min_limit = numeric_limits::max(); + int64_t min_limit = std::numeric_limits::max(); for (MemTracker* limit_tracker : limit_trackers_) { DCHECK(limit_tracker->has_limit()); min_limit = std::min(min_limit, limit_tracker->GetLimit(mode)); @@ -268,28 +266,28 @@ int64_t MemTracker::GetPoolMemReserved() { return mem_reserved; } -std::shared_ptr QueryMemTrackerRegistry::RegisterQueryMemTracker( +std::shared_ptr QueryMemTrackerRegistry::register_query_mem_tracker( const std::string& query_id, int64_t mem_limit) { DCHECK(!query_id.empty()); - VLOG(2) << "Register query memory tracker, query id: " << query_id - << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); + VLOG_FILE << "Register query memory tracker, query id: " << query_id + << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); // First time this query_id registered, make a new object, otherwise do nothing. 
// Combine CreateTracker and emplace into one operation to avoid the use of locks _query_mem_trackers.try_emplace_l( query_id, [](std::shared_ptr) {}, MemTracker::CreateTracker(mem_limit, - strings::Substitute(QUERY_MEM_TRACKER_LABEL_FORMAT, query_id), + fmt::format(QUERY_MEM_TRACKER_LABEL_FORMAT, query_id), ExecEnv::GetInstance()->all_query_mem_tracker(), false, false, MemTrackerLevel::OVERVIEW, query_id)); - std::shared_ptr tracker = GetQueryMemTracker(query_id); + std::shared_ptr tracker = get_query_mem_tracker(query_id); if (tracker != nullptr) { - tracker->set_exist_transfer_control(); + tracker->exist_consume_or_release_missing(); } return tracker; } -std::shared_ptr QueryMemTrackerRegistry::GetQueryMemTracker( +std::shared_ptr QueryMemTrackerRegistry::get_query_mem_tracker( const std::string& query_id) { DCHECK(!query_id.empty()); std::shared_ptr tracker = nullptr; @@ -299,7 +297,7 @@ std::shared_ptr QueryMemTrackerRegistry::GetQueryMemTracker( return tracker; } -void QueryMemTrackerRegistry::DeregisterQueryMemTracker() { +void QueryMemTrackerRegistry::deregister_query_mem_tracker() { std::vector expired_querys; for (auto it = _query_mem_trackers.begin(); it != _query_mem_trackers.end(); it++) { // No RuntimeState uses this query MemTracker, it is only referenced by this map, delete it @@ -310,7 +308,7 @@ void QueryMemTrackerRegistry::DeregisterQueryMemTracker() { for (auto qid : expired_querys) { DCHECK(_query_mem_trackers[qid].use_count() == 1); _query_mem_trackers.erase(qid); - VLOG(2) << "Deregister query memory tracker, query id: " << qid; + VLOG_FILE << "Deregister query memory tracker, query id: " << qid; } } @@ -321,7 +319,7 @@ MemTracker::~MemTracker() { delete reservation_counters_.load(); if (parent()) { - if (!exist_transfer_control()) { + if (!consume_or_release_missing()) { DCHECK(consumption() == 0) << "Memory tracker " << debug_string() << " has unreleased consumption " << consumption(); } @@ -453,7 +451,7 @@ std::string 
MemTracker::LogUsage(int max_recursive_depth, const string& prefix, if (max_recursive_depth == 0) return ss.str(); // Recurse and get information about the children - std::string new_prefix = strings::Substitute(" $0", prefix); + std::string new_prefix = fmt::format(" {}", prefix); int64_t child_consumption; std::string child_trackers_usage; list> children; diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index 9d75111711ee33..b33461b49fdd50 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -59,7 +59,7 @@ enum class MemTrackerLevel { OVERVIEW = 0, TASK, VERBOSE }; // 3. Repeated releases of MemTacker. When the consume is called on the child MemTracker, // after the release is called on the parent MemTracker, // the child ~MemTracker will cause repeated releases. -static const int MIN_NEGATIVE_CONSUMPTION_VALUE = -4 * 1024 * 1024; +static const int MIN_NEGATIVE_CONSUMPTION_VALUE = -10 * 1024 * 1024; class ObjectPool; class MemTracker; @@ -157,7 +157,8 @@ class MemTracker : public std::enable_shared_from_this { tracker->consumption_->add(bytes); if (LIKELY(tracker->consumption_metric_ == nullptr)) { DCHECK_GE(tracker->consumption_->current_value(), - std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, -config::untracked_mem_limit)); + std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, + -config::mem_tracker_consume_min_size_mbytes * 10)); } } } @@ -266,8 +267,12 @@ class MemTracker : public std::enable_shared_from_this { /// trackers since we can enforce that the reported memory usage is internally /// consistent.) if (LIKELY(tracker->consumption_metric_ == nullptr)) { + // A query corresponds to multiple threads, and each thread may have + // config::mem_tracker_consume_min_size_mbytes. The length is not cosumeed. Here, + // 10 is just a guess. 
DCHECK_GE(tracker->consumption_->current_value(), - std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, -config::untracked_mem_limit)) + std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, + -config::mem_tracker_consume_min_size_mbytes * 10)) << std::endl << tracker->LogUsage(UNLIMITED_DEPTH); } @@ -341,9 +346,9 @@ class MemTracker : public std::enable_shared_from_this { } } - bool exist_transfer_control() { return _exist_transfer_control; } + bool consume_or_release_missing() { return _consume_or_release_missing; } - void set_exist_transfer_control() { _exist_transfer_control = true; } + void exist_consume_or_release_missing() { _consume_or_release_missing = true; } /// Returns the lowest limit for this tracker and its ancestors. Returns /// -1 if there is no limit. @@ -536,10 +541,10 @@ class MemTracker : public std::enable_shared_from_this { /// Lock to protect GcMemory(). This prevents many GCs from occurring at once. std::mutex gc_lock_; - /// True if this is a Query MemTracker returned from RegisterQueryMemTracker(). + /// True if this is a Query MemTracker returned from register_query_mem_tracker(). bool _is_query_mem_tracker = false; - /// Only valid for MemTrackers returned from RegisterQueryMemTracker() + /// Only valid for MemTrackers returned from register_query_mem_tracker() std::string query_id_; /// Hard limit on memory consumption, in bytes. May not be exceeded. If limit_ == -1, @@ -550,9 +555,9 @@ class MemTracker : public std::enable_shared_from_this { /// TryConsume() can opt not to exceed this limit. If -1, there is no consumption limit. const int64_t soft_limit_; - // Whether memory control transfer occurs, between mem trackers. Happened at: - // The current tracker calls consume/release, and other threads call release/consume. - bool _exist_transfer_control = false; + // Is there a situation where different MemTracker calls consume and release in the same block. + // Happened at: The current tracker calls consume/release, and other threads call release/consume. 
+ bool _consume_or_release_missing = false; std::string label_; @@ -620,12 +625,12 @@ class QueryMemTrackerRegistry { // memory usage of all querys executing. The first time this is called for a query, // a new MemTracker object is created with the process tracker as its parent. // Newly created trackers will always have a limit of -1. - std::shared_ptr RegisterQueryMemTracker(const std::string& query_id, - int64_t mem_limit = -1); + std::shared_ptr register_query_mem_tracker(const std::string& query_id, + int64_t mem_limit = -1); - std::shared_ptr GetQueryMemTracker(const std::string& query_id); + std::shared_ptr get_query_mem_tracker(const std::string& query_id); - void DeregisterQueryMemTracker(); + void deregister_query_mem_tracker(); private: // All per-query MemTracker objects. diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index 6e0e83fcb89e4b..35cd189e230ca0 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -28,20 +28,20 @@ #include "exec/scan_node.h" #include "exprs/expr.h" #include "runtime/data_stream_mgr.h" -#include "runtime/thread_context.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/mem_tracker.h" #include "runtime/result_buffer_mgr.h" #include "runtime/result_queue_mgr.h" #include "runtime/row_batch.h" +#include "runtime/thread_context.h" #include "util/container_util.hpp" #include "util/cpu_info.h" +#include "util/logging.h" #include "util/mem_info.h" #include "util/parse_util.h" #include "util/pretty_printer.h" #include "util/uid_util.h" -#include "util/logging.h" #include "vec/core/block.h" #include "vec/exec/vexchange_node.h" @@ -77,10 +77,12 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, const TPlanFragmentExecParams& params = request.params; _query_id = params.query_id; - TAG(LOG(INFO)).log("PlanFragmentExecutor::prepare") - 
.query_id(_query_id).instance_id(params.fragment_instance_id) - .tag("backend_num", std::to_string(request.backend_num)) - .tag("pthread_id", std::to_string((uintptr_t) pthread_self())); + TAG(LOG(INFO)) + .log("PlanFragmentExecutor::prepare") + .query_id(_query_id) + .instance_id(params.fragment_instance_id) + .tag("backend_num", std::to_string(request.backend_num)) + .tag("pthread_id", std::to_string((uintptr_t)pthread_self())); // VLOG_CRITICAL << "request:\n" << apache::thrift::ThriftDebugString(request); const TQueryGlobals& query_globals = @@ -236,9 +238,12 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, Status PlanFragmentExecutor::open() { int64_t mem_limit = _runtime_state->fragment_mem_tracker()->limit(); - TAG(LOG(INFO)).log("PlanFragmentExecutor::open, using query memory limit: " + PrettyPrinter::print(mem_limit, TUnit::BYTES)) - .query_id(_query_id).instance_id(_runtime_state->fragment_instance_id()) - .tag("mem_limit", std::to_string(mem_limit)); + TAG(LOG(INFO)) + .log("PlanFragmentExecutor::open, using query memory limit: " + + PrettyPrinter::print(mem_limit, TUnit::BYTES)) + .query_id(_query_id) + .instance_id(_runtime_state->fragment_instance_id()) + .tag("mem_limit", std::to_string(mem_limit)); // we need to start the profile-reporting thread before calling Open(), since it // may block @@ -266,7 +271,7 @@ Status PlanFragmentExecutor::open() { } if (status.is_cancelled()) { - if (_cancel_reason == PPlanFragmentCancelReason::MEMORY_EXCEED_LIMIT) { + if (_cancel_reason == PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED) { status = Status::MemoryLimitExceeded(_cancel_msg); } } @@ -453,7 +458,8 @@ void PlanFragmentExecutor::_collect_query_statistics() { void PlanFragmentExecutor::_collect_node_statistics() { DCHECK(_runtime_state->backend_id() != -1); - NodeStatistics* node_statistics = _query_statistics->add_nodes_statistics(_runtime_state->backend_id()); + NodeStatistics* node_statistics = + 
_query_statistics->add_nodes_statistics(_runtime_state->backend_id()); node_statistics->add_peak_memory(_mem_tracker->peak_consumption()); } @@ -467,7 +473,6 @@ void PlanFragmentExecutor::report_profile() { // tell Open() that we started _report_thread_started_cv.notify_one(); - // Jitter the reporting time of remote fragments by a random amount between // 0 and the report_interval. This way, the coordinator doesn't get all the // updates at once so its better for contention as well as smoother progress @@ -561,8 +566,10 @@ Status PlanFragmentExecutor::get_next(RowBatch** batch) { update_status(status); if (_done) { - TAG(LOG(INFO)).log("PlanFragmentExecutor::get_next finished") - .query_id(_query_id).instance_id(_runtime_state->fragment_instance_id()); + TAG(LOG(INFO)) + .log("PlanFragmentExecutor::get_next finished") + .query_id(_query_id) + .instance_id(_runtime_state->fragment_instance_id()); // Query is done, return the thread token stop_report_thread(); send_report(true); @@ -620,8 +627,10 @@ void PlanFragmentExecutor::update_status(const Status& new_status) { } void PlanFragmentExecutor::cancel(const PPlanFragmentCancelReason& reason, const std::string& msg) { - TAG(LOG(INFO)).log("PlanFragmentExecutor::cancel") - .query_id(_query_id).instance_id(_runtime_state->fragment_instance_id()); + TAG(LOG(INFO)) + .log("PlanFragmentExecutor::cancel") + .query_id(_query_id) + .instance_id(_runtime_state->fragment_instance_id()); DCHECK(_prepared); _cancel_reason = reason; _cancel_msg = msg; diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 262cc0dc5b4bd1..bda83e7c770182 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -223,11 +223,12 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { _exec_env->process_mem_tracker(), true, false); #ifdef BE_TEST if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { - _hook_query_mem_tracker = 
_exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker( - print_id(query_id), bytes_limit); + _hook_query_mem_tracker = + _exec_env->query_mem_tracker_registry()->register_query_mem_tracker( + print_id(query_id), bytes_limit); } #else - _hook_query_mem_tracker = _exec_env->query_mem_tracker_registry()->RegisterQueryMemTracker( + _hook_query_mem_tracker = _exec_env->query_mem_tracker_registry()->register_query_mem_tracker( print_id(query_id), bytes_limit); #endif _instance_mem_tracker = diff --git a/be/src/runtime/thread_context.cpp b/be/src/runtime/thread_context.cpp new file mode 100644 index 00000000000000..f0fc721d88c52a --- /dev/null +++ b/be/src/runtime/thread_context.cpp @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "runtime/thread_context.h" + +#include + +namespace doris { + +void ThreadContext::update_query_mem_tracker(const std::string& query_id) { +#ifdef BE_TEST + if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { + return; + } +#endif + update_query_mem_tracker( + ExecEnv::GetInstance()->query_mem_tracker_registry()->get_query_mem_tracker(query_id)); +} + +void ThreadContext::update_query_mem_tracker( + std::weak_ptr mem_tracker = std::weak_ptr()) { + if (_untracked_mem != 0) { + consume(); + _untracked_mem = 0; + } + _query_mem_tracker = mem_tracker; +} + +void ThreadContext::query_mem_limit_exceeded(int64_t mem_usage) { + if (_task_id != "" && _fragment_instance_id != TUniqueId() && + ExecEnv::GetInstance()->initialized() && + ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { + auto st = _query_mem_tracker.lock()->MemLimitExceeded( + nullptr, "Query Memory exceed limit in TCMalloc Hook New.", mem_usage); + + std::string detail = + "Query Memory exceed limit in TCMalloc Hook New, Backend: {}, Fragment: {}, Used: " + "{}, Limit: {}. You can change the limit by session variable exec_mem_limit."; + fmt::format(detail, BackendOptions::get_localhost(), print_id(_fragment_instance_id), + std::to_string(_query_mem_tracker.lock()->consumption()), + std::to_string(_query_mem_tracker.lock()->limit())); + ExecEnv::GetInstance()->fragment_mgr()->cancel( + _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED, detail); + _fragment_instance_id = TUniqueId(); // Make sure it will only be canceled once + } +} + +void ThreadContext::global_mem_limit_exceeded(int64_t mem_usage) { + std::string detail = "Global Memory exceed limit in TCMalloc Hook New."; + auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); +} + +// Note that, If call the memory allocation operation in TCMalloc new/delete Hook, +// such as calling LOG/iostream/sstream/stringstream/etc. 
related methods, +// must increase the control to avoid entering infinite recursion, otherwise it may cause crash or stuck, +void ThreadContext::consume() { + // Query_mem_tracker and global_hook_tracker are counted separately, + // in order to ensure that the process memory counted by global_hook_tracker is accurate enough. + // + // Otherwise, if query_mem_tracker is the child of global_hook_tracker and global_hook_tracker + // is the default tracker, it may be the same block of memory. Consume is called in query_mem_tracker, + // and release is called in global_hook_tracker, which is repeatedly released after ~query_mem_tracker. + if (!_query_mem_tracker.expired()) { + if (_query_mem_consuming == false) { + _query_mem_consuming = true; + if (!_query_mem_tracker.lock()->TryConsume(_missed_query_tracker_mem + + _untracked_mem)) { + query_mem_limit_exceeded(_missed_query_tracker_mem + _untracked_mem); + _missed_query_tracker_mem += _untracked_mem; + } else { + _missed_query_tracker_mem = 0; + } + _query_mem_consuming = false; + } else { + _missed_query_tracker_mem += _untracked_mem; + } + } + + // The first time GetGlobalHookTracker is called after the main thread starts, == nullptr + if (_global_hook_tracker != nullptr) { + if (_global_mem_consuming == false) { + _global_mem_consuming = true; + if (!_global_hook_tracker->TryConsume(_missed_global_tracker_mem + _untracked_mem)) { + global_mem_limit_exceeded(_missed_global_tracker_mem + _untracked_mem); + _missed_global_tracker_mem += _untracked_mem; + } else { + _missed_global_tracker_mem = 0; + } + _global_mem_consuming = false; + } else { + _missed_global_tracker_mem += _untracked_mem; + } + } else { + _missed_global_tracker_mem += _untracked_mem; + } +} + +void ThreadContext::try_consume(int64_t size) { + if (_stop_mem_tracker == true) { + return; + } + _untracked_mem += size; + // When some threads `0 <_untracked_mem <_untracked_mem_limit` + // and some threads `_untracked_mem <= -_untracked_mem_limit` trigger 
consumption(), + it will cause tracker->consumption to be temporarily less than 0. + if (_untracked_mem >= _untracked_mem_limit || _untracked_mem <= -_untracked_mem_limit) { + consume(); + _untracked_mem = 0; + } +} + +} // namespace doris diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index bc2f8c36922f03..d7b4e46c4c76ad 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -18,155 +18,91 @@ #pragma once #include +#include #include "gen_cpp/Types_types.h" #include "runtime/exec_env.h" -#include "runtime/fragment_mgr.h" #include "runtime/mem_tracker.h" #include "service/backend_options.h" namespace doris { -class TheadContext { +// The thread context saves some info about a working thread. +// 3 required info: +// 1. thread_id: Current thread id, Auto generated. +// 2. type: The type is an enum value indicating which type of task current thread is running. +// For example: QUERY, LOAD, COMPACTION, ... +// 3. task id: A unique id to identify this task. maybe query id, load job id, etc. +// +// There may be other optional info to be added later. +class ThreadContext { +public: + enum TaskType { + UNKNOWN = 0, + QUERY = 1, + LOAD = 2, + COMPACTION = 3 + // to be added ... 
+ }; + public: - TheadContext() + ThreadContext() : _thread_id(std::this_thread::get_id()), + _type(TaskType::UNKNOWN), _global_hook_tracker(MemTracker::GetGlobalHookTracker()) {} - ~TheadContext() { update_query_mem_tracker(); } + ~ThreadContext() {} - void attach_query(const TUniqueId& query_id, - const TUniqueId& fragment_instance_id = TUniqueId()) { - _query_id = query_id; + void attach(const TaskType& type, const std::string& task_id, + const TUniqueId& fragment_instance_id = TUniqueId()) { + _type = type; + _task_id = task_id; _fragment_instance_id = fragment_instance_id; -#ifdef BE_TEST - if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { - return; - } -#endif - update_query_mem_tracker( - ExecEnv::GetInstance()->query_mem_tracker_registry()->GetQueryMemTracker( - print_id(query_id))); + update_query_mem_tracker(task_id); } - void unattach_query() { - _query_id = TUniqueId(); + void detach() { + _type = TaskType::UNKNOWN; + _task_id = ""; _fragment_instance_id = TUniqueId(); update_query_mem_tracker(); } - void update_query_mem_tracker( - std::weak_ptr mem_tracker = std::weak_ptr()) { - if (_untracked_mem != 0) { - consume(); - _untracked_mem = 0; - } - _query_mem_tracker = mem_tracker; - } + const std::string type() const; + const std::string& task_id() const { return _task_id; } + const std::thread::id& thread_id() const { return _thread_id; } + const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } - void query_mem_limit_exceeded(int64_t mem_usage) { - if (_query_id != TUniqueId() && _fragment_instance_id != TUniqueId() && - ExecEnv::GetInstance()->is_init() && - ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { - std::string detail = "Query Memory exceed limit in TCMalloc Hook New."; - auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); - detail += ", Backend: " + BackendOptions::get_localhost() + - ", Fragment: " + 
print_id(_fragment_instance_id) + - ", Used: " + std::to_string(_query_mem_tracker.lock()->consumption()) + - ", Limit: " + std::to_string(_query_mem_tracker.lock()->limit()) + - ". You can change the limit by session variable exec_mem_limit."; - ExecEnv::GetInstance()->fragment_mgr()->cancel( - _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_EXCEED_LIMIT, detail); - _fragment_instance_id = TUniqueId(); // Make sure it will only be canceled once - } - } - - void global_mem_limit_exceeded(int64_t mem_usage) { - std::string detail = "Global Memory exceed limit in TCMalloc Hook New."; - auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); - } + void update_query_mem_tracker(const std::string& query_id) {} + void update_query_mem_tracker( + std::weak_ptr mem_tracker = std::weak_ptr()) {} + void query_mem_limit_exceeded(int64_t mem_usage) {} + void global_mem_limit_exceeded(int64_t mem_usage) {} // Note that, If call the memory allocation operation in TCMalloc new/delete Hook, // such as calling LOG/iostream/sstream/stringstream/etc. related methods, // must increase the control to avoid entering infinite recursion, otherwise it may cause crash or stuck, - void consume() { - // Query_mem_tracker and global_hook_tracker are counted separately, - // in order to ensure that the process memory counted by global_hook_tracker is accurate enough. - // - // Otherwise, if query_mem_tracker is the child of global_hook_tracker and global_hook_tracker - // is the default tracker, it may be the same block of memory. Consume is called in query_mem_tracker, - // and release is called in global_hook_tracker, which is repeatedly released after ~query_mem_tracker. 
- if (!_query_mem_tracker.expired()) { - if (_query_mem_consuming == false) { - _query_mem_consuming = true; - if (!_query_mem_tracker.lock()->TryConsume(_missed_query_tracker_mem + - _untracked_mem)) { - query_mem_limit_exceeded(_missed_query_tracker_mem + _untracked_mem); - _missed_query_tracker_mem += _untracked_mem; - } else { - _missed_query_tracker_mem = 0; - } - _query_mem_consuming = false; - } else { - _missed_query_tracker_mem += _untracked_mem; - } - } - - // The first time GetGlobalHookTracker is called after the main thread starts, == nullptr - if (_global_hook_tracker != nullptr) { - if (_global_mem_consuming == false) { - _global_mem_consuming = true; - if (!_global_hook_tracker->TryConsume(_missed_global_tracker_mem + - _untracked_mem)) { - global_mem_limit_exceeded(_missed_global_tracker_mem + _untracked_mem); - _missed_global_tracker_mem += _untracked_mem; - } else { - _missed_global_tracker_mem = 0; - } - _global_mem_consuming = false; - } else { - _missed_global_tracker_mem += _untracked_mem; - } - } else { - _missed_global_tracker_mem += _untracked_mem; - } - } - - void try_consume(int64_t size) { - if (_stop_mem_tracker == true) { - return; - } - _untracked_mem += size; - // When some threads `0 <_untracked_mem <_untracked_mem_limit` - // and some threads `_untracked_mem <= -_untracked_mem_limit` trigger consumption(), - // it will cause tracker->consumption to be temporarily less than 0. 
- if (_untracked_mem >= _untracked_mem_limit || _untracked_mem <= -_untracked_mem_limit) { - consume(); - _untracked_mem = 0; - } - } + void consume() {} + void try_consume(int64_t size) {} void consume_mem(int64_t size) { try_consume(size); } void release_mem(int64_t size) { try_consume(-size); } - void stop_mem_tracker() { - _stop_mem_tracker = true; - } - - const TUniqueId& query_id() { return _query_id; } - const std::thread::id& thread_id() { return _thread_id; } + void stop_mem_tracker() { _stop_mem_tracker = true; } private: std::thread::id _thread_id; - TUniqueId _query_id; + TaskType _type; + std::string _task_id; TUniqueId _fragment_instance_id; + std::weak_ptr _query_mem_tracker; std::shared_ptr _global_hook_tracker = nullptr; // The memory size that is not tracker is used to control batch trackers, // avoid frequent consume/release. int64_t _untracked_mem = 0; - int64_t _untracked_mem_limit = config::untracked_mem_limit; + int64_t _untracked_mem_limit = config::mem_tracker_consume_min_size_mbytes; // Memory size of tracker failure after mem limit exceeded, // expect to be successfully consumed later. 
@@ -183,5 +119,23 @@ class TheadContext { bool _stop_mem_tracker = false; }; -inline thread_local TheadContext thread_local_ctx; +inline thread_local ThreadContext thread_local_ctx; + +inline const std::string task_type_string(ThreadContext::TaskType type) { + switch (type) { + case ThreadContext::TaskType::QUERY: + return "QUERY"; + case ThreadContext::TaskType::LOAD: + return "LOAD"; + case ThreadContext::TaskType::COMPACTION: + return "COMPACTION"; + default: + return "UNKNOWN"; + } +} + +inline const std::string ThreadContext::type() const { + return task_type_string(_type); +} + } // namespace doris diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 52e23c21b80f32..538ed686b4bf27 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -287,7 +287,7 @@ int main(int argc, char** argv) { #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) doris::MemInfo::refresh_current_mem(); #endif - doris::ExecEnv::GetInstance()->query_mem_tracker_registry()->DeregisterQueryMemTracker(); + doris::ExecEnv::GetInstance()->query_mem_tracker_registry()->deregister_query_mem_tracker(); sleep(10); } diff --git a/be/test/exec/tablet_sink_test.cpp b/be/test/exec/tablet_sink_test.cpp index 9fb6db40cd852b..405e7c0d8293f1 100644 --- a/be/test/exec/tablet_sink_test.cpp +++ b/be/test/exec/tablet_sink_test.cpp @@ -57,7 +57,7 @@ class OlapTableSinkTest : public testing::Test { _env->_internal_client_cache = new BrpcClientCache(); _env->_function_client_cache = new BrpcClientCache(); _env->_buffer_reservation = new ReservationTracker(); - _env->_query_mem_tracker_registry = new QueryMemTrackerRegistry(); + _env->_query_mem_tracker_registry.reset(new QueryMemTrackerRegistry()); ThreadPoolBuilder("SendBatchThreadPool") .set_min_threads(1) .set_max_threads(5) @@ -74,7 +74,6 @@ class OlapTableSinkTest : public testing::Test { SAFE_DELETE(_env->_master_info); SAFE_DELETE(_env->_thread_mgr); 
SAFE_DELETE(_env->_buffer_reservation); - SAFE_DELETE(_env->_query_mem_tracker_registry); if (_server) { _server->Stop(100); _server->Join(); diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc index 9967142c3249ff..82263f2e31ed21 100644 --- a/be/test/runtime/test_env.cc +++ b/be/test/runtime/test_env.cc @@ -38,7 +38,7 @@ TestEnv::TestEnv() _exec_env->_thread_mgr = new ThreadResourceMgr(2); _exec_env->_buffer_reservation = new ReservationTracker(); _exec_env->_process_mem_tracker = MemTracker::CreateTracker(-1, "TestEnv"); - _exec_env->_query_mem_tracker_registry = new QueryMemTrackerRegistry(); + _exec_env->_query_mem_tracker_registry.reset(new QueryMemTrackerRegistry()); _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10); _exec_env->disk_io_mgr()->init(_io_mgr_tracker); _exec_env->_scan_thread_pool = new PriorityThreadPool(1, 16); @@ -66,7 +66,6 @@ TestEnv::~TestEnv() { SAFE_DELETE(_exec_env->_scan_thread_pool); SAFE_DELETE(_exec_env->_disk_io_mgr); SAFE_DELETE(_exec_env->_buffer_reservation); - SAFE_DELETE(_exec_env->_query_mem_tracker_registry); SAFE_DELETE(_exec_env->_thread_mgr); if (_engine == StorageEngine::_s_instance) { diff --git a/be/test/util/arrow/arrow_work_flow_test.cpp b/be/test/util/arrow/arrow_work_flow_test.cpp index 4f34b1a395c6fb..01c11224056936 100644 --- a/be/test/util/arrow/arrow_work_flow_test.cpp +++ b/be/test/util/arrow/arrow_work_flow_test.cpp @@ -91,7 +91,7 @@ void ArrowWorkFlowTest::init_runtime_state() { _exec_env->_result_queue_mgr = new ResultQueueMgr(); _exec_env->_thread_mgr = new ThreadResourceMgr(); _exec_env->_buffer_reservation = new ReservationTracker(); - _exec_env->_query_mem_tracker_registry = new QueryMemTrackerRegistry(); + _exec_env->_query_mem_tracker_registry.reset(new QueryMemTrackerRegistry()); TQueryOptions query_options; query_options.batch_size = 1024; TUniqueId query_id; diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 
4b3b137af8e93e..7d4f190c4bd828 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1441,10 +1441,10 @@ The size of the buffer before flashing ``` * Default: 0 -### `untracked_mem_limit` +### `mem_tracker_consume_min_size_mbytes` * Type: int32 -* Description: The maximum buffer length allowed when TCMalloc Hook consumes/releases MemTracker, that is, the minimum batch of consume/release. Increasing this value will increase the frequency of consume/release. +* Description: The minimum length of TCMalloc Hook when consume/release MemTracker. Consume size smaller than this value will continue to accumulate to avoid frequent calls to consume/release of MemTracker. Increasing this value will increase the frequency of consume/release. * Default: 4M ### `max_segment_num_per_rowset` diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index 57dc49ff11cdd2..017e48ee5977d1 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1460,10 +1460,10 @@ webserver默认工作线程数 ``` * 默认值: 0 -### `untracked_mem_limit` +### `mem_tracker_consume_min_size_mbytes` * 类型: int32 -* 描述: TCMalloc Hook consume/release MemTracker时允许的最大缓存长度,即consume/release的最小批次。增大该值会增加consume/release的频率。 +* 描述: TCMalloc Hook consume/release MemTracker时的最小长度,小于该值的consume size会持续累加,避免频繁调用MemTracker的consume/release,增大该值会增加consume/release的频率。 * 默认值: 4M ### `max_segment_num_per_rowset` diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto index 5afb8679c281b8..76c5f95ec5803e 100644 --- a/gensrc/proto/internal_service.proto +++ b/gensrc/proto/internal_service.proto @@ -143,7 +143,7 @@ enum PPlanFragmentCancelReason { USER_CANCEL = 2; INTERNAL_ERROR = 3; TIMEOUT = 4; - MEMORY_EXCEED_LIMIT = 5; + MEMORY_LIMIT_EXCEED = 5; }; message PCancelPlanFragmentRequest { From 
02e6fc49061363aea1f4fb4b734d9c4b0092b863 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Tue, 7 Dec 2021 14:55:20 +0800 Subject: [PATCH 09/14] split ThreadContext to ThreadMemTracker --- be/src/common/config.h | 6 +- be/src/runtime/CMakeLists.txt | 1 + be/src/runtime/thread_context.h | 71 ++++++---------- ...ead_context.cpp => thread_mem_tracker.cpp} | 47 +++++------ be/src/runtime/thread_mem_tracker.h | 80 +++++++++++++++++++ be/src/service/doris_main.cpp | 5 +- .../administrator-guide/config/be_config.md | 8 +- .../administrator-guide/config/be_config.md | 8 +- 8 files changed, 152 insertions(+), 74 deletions(-) rename be/src/runtime/{thread_context.cpp => thread_mem_tracker.cpp} (75%) create mode 100644 be/src/runtime/thread_mem_tracker.h diff --git a/be/src/common/config.h b/be/src/common/config.h index fc0c4f922832c2..c7bb9627791593 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -60,6 +60,9 @@ CONF_mInt64(tc_free_memory_rate, "20"); // https://github.com/gperftools/gperftools/issues/1111 CONF_Int64(tc_max_total_thread_cache_bytes, "1073741824"); +// Whether to initialize TCmalloc new/delete Hook, MemTracker is currently counted in Hook. +CONF_mBool(tc_init_hook, "true"); + // process memory limit specified as number of bytes // ('[bB]?'), megabytes ('[mM]'), gigabytes ('[gG]'), // or percentage of the physical memory ('%'). @@ -604,7 +607,8 @@ CONF_Int16(mem_tracker_level, "0"); // The minimum length when TCMalloc Hook consumes/releases MemTracker, consume size // smaller than this value will continue to accumulate. specified as number of bytes. -// Increasing this value will increase the frequency of consume/release. +// Decreasing this value will increase the frequency of consume/release. +// Increasing this value will cause MemTracker statistics to be inaccurate. 
CONF_mInt32(mem_tracker_consume_min_size_mbytes, "1048576"); // The version information of the tablet will be stored in the memory diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt index 142934d5cbf602..2647920a1453b6 100644 --- a/be/src/runtime/CMakeLists.txt +++ b/be/src/runtime/CMakeLists.txt @@ -46,6 +46,7 @@ set(RUNTIME_FILES runtime_state.cpp runtime_filter_mgr.cpp string_value.cpp + thread_mem_tracker.cpp thread_resource_mgr.cpp decimalv2_value.cpp large_int_value.cpp diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index d7b4e46c4c76ad..47d9bf7cdd1bc7 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -21,9 +21,7 @@ #include #include "gen_cpp/Types_types.h" -#include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" -#include "service/backend_options.h" +#include "runtime/thread_mem_tracker.h" namespace doris { @@ -46,10 +44,9 @@ class ThreadContext { }; public: - ThreadContext() - : _thread_id(std::this_thread::get_id()), - _type(TaskType::UNKNOWN), - _global_hook_tracker(MemTracker::GetGlobalHookTracker()) {} + ThreadContext() : _thread_id(std::this_thread::get_id()), _type(TaskType::UNKNOWN) { + _thread_mem_tracker.reset(new ThreadMemTracker()); + } ~ThreadContext() {} void attach(const TaskType& type, const std::string& task_id, @@ -57,14 +54,18 @@ class ThreadContext { _type = type; _task_id = task_id; _fragment_instance_id = fragment_instance_id; - update_query_mem_tracker(task_id); + if (type == TaskType::QUERY) { + _thread_mem_tracker->attach_query(task_id, fragment_instance_id); + } } void detach() { + if (_type == TaskType::QUERY) { + _thread_mem_tracker->detach_query(); + } _type = TaskType::UNKNOWN; _task_id = ""; _fragment_instance_id = TUniqueId(); - update_query_mem_tracker(); } const std::string type() const; @@ -72,23 +73,17 @@ class ThreadContext { const std::thread::id& thread_id() const { return _thread_id; } const TUniqueId& 
fragment_instance_id() const { return _fragment_instance_id; } - void update_query_mem_tracker(const std::string& query_id) {} - void update_query_mem_tracker( - std::weak_ptr mem_tracker = std::weak_ptr()) {} - void query_mem_limit_exceeded(int64_t mem_usage) {} - void global_mem_limit_exceeded(int64_t mem_usage) {} - - // Note that, If call the memory allocation operation in TCMalloc new/delete Hook, - // such as calling LOG/iostream/sstream/stringstream/etc. related methods, - // must increase the control to avoid entering infinite recursion, otherwise it may cause crash or stuck, - void consume() {} - void try_consume(int64_t size) {} - - void consume_mem(int64_t size) { try_consume(size); } - - void release_mem(int64_t size) { try_consume(-size); } - - void stop_mem_tracker() { _stop_mem_tracker = true; } + void consume_mem(int64_t size) { + if (_thread_mem_tracker != nullptr) { + _thread_mem_tracker->try_consume(size); + } + } + void release_mem(int64_t size) { + if (_thread_mem_tracker != nullptr) { + _thread_mem_tracker->try_consume(-size); + } + } + void stop_mem_tracker() { _thread_mem_tracker->stop_mem_tracker(); } private: std::thread::id _thread_id; @@ -96,27 +91,9 @@ class ThreadContext { std::string _task_id; TUniqueId _fragment_instance_id; - std::weak_ptr _query_mem_tracker; - std::shared_ptr _global_hook_tracker = nullptr; - - // The memory size that is not tracker is used to control batch trackers, - // avoid frequent consume/release. - int64_t _untracked_mem = 0; - int64_t _untracked_mem_limit = config::mem_tracker_consume_min_size_mbytes; - - // Memory size of tracker failure after mem limit exceeded, - // expect to be successfully consumed later. - int64_t _missed_query_tracker_mem = 0; - int64_t _missed_global_tracker_mem = 0; - - // When memory is being consumed, avoid entering infinite recursion. - bool _query_mem_consuming = false; - bool _global_mem_consuming = false; - - // In some cases, we want to turn off memory statistics. 
- // For example, when ~GlobalHookTracker, TCMalloc delete hook - // release GlobalHookTracker will crash. - bool _stop_mem_tracker = false; + // After _thread_mem_tracker is initialized, + // the current thread TCMalloc Hook starts to consume/release mem_tracker + std::unique_ptr _thread_mem_tracker; }; inline thread_local ThreadContext thread_local_ctx; diff --git a/be/src/runtime/thread_context.cpp b/be/src/runtime/thread_mem_tracker.cpp similarity index 75% rename from be/src/runtime/thread_context.cpp rename to be/src/runtime/thread_mem_tracker.cpp index f0fc721d88c52a..7059925b5bda18 100644 --- a/be/src/runtime/thread_context.cpp +++ b/be/src/runtime/thread_mem_tracker.cpp @@ -15,15 +15,14 @@ // specific language governing permissions and limitations // under the License. -#pragma once +#include "runtime/thread_mem_tracker.h" -#include "runtime/thread_context.h" - -#include +#include "service/backend_options.h" namespace doris { -void ThreadContext::update_query_mem_tracker(const std::string& query_id) { +void ThreadMemTracker::attach_query(const std::string& query_id, + const TUniqueId& fragment_instance_id) { #ifdef BE_TEST if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { return; @@ -31,10 +30,15 @@ void ThreadContext::update_query_mem_tracker(const std::string& query_id) { #endif update_query_mem_tracker( ExecEnv::GetInstance()->query_mem_tracker_registry()->get_query_mem_tracker(query_id)); + _fragment_instance_id = fragment_instance_id; +} + +void ThreadMemTracker::detach_query() { + update_query_mem_tracker(std::weak_ptr()); + _fragment_instance_id = TUniqueId(); } -void ThreadContext::update_query_mem_tracker( - std::weak_ptr mem_tracker = std::weak_ptr()) { +void ThreadMemTracker::update_query_mem_tracker(std::weak_ptr mem_tracker) { if (_untracked_mem != 0) { consume(); _untracked_mem = 0; @@ -42,15 +46,14 @@ void ThreadContext::update_query_mem_tracker( _query_mem_tracker = mem_tracker; } -void 
ThreadContext::query_mem_limit_exceeded(int64_t mem_usage) { - if (_task_id != "" && _fragment_instance_id != TUniqueId() && - ExecEnv::GetInstance()->initialized() && +void ThreadMemTracker::query_mem_limit_exceeded(int64_t mem_usage) { + if (_fragment_instance_id != TUniqueId() && ExecEnv::GetInstance()->initialized() && ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { - auto st = _query_mem_tracker.lock()->MemLimitExceeded( - nullptr, "Query Memory exceed limit in TCMalloc Hook New.", mem_usage); + std::string detail = "Query Memory exceed limit in TCMalloc Hook New."; + auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); - std::string detail = - "Query Memory exceed limit in TCMalloc Hook New, Backend: {}, Fragment: {}, Used: " + detail += + " Query Memory exceed limit in TCMalloc Hook New, Backend: {}, Fragment: {}, Used: " "{}, Limit: {}. You can change the limit by session variable exec_mem_limit."; fmt::format(detail, BackendOptions::get_localhost(), print_id(_fragment_instance_id), std::to_string(_query_mem_tracker.lock()->consumption()), @@ -61,15 +64,12 @@ void ThreadContext::query_mem_limit_exceeded(int64_t mem_usage) { } } -void ThreadContext::global_mem_limit_exceeded(int64_t mem_usage) { +void ThreadMemTracker::global_mem_limit_exceeded(int64_t mem_usage) { std::string detail = "Global Memory exceed limit in TCMalloc Hook New."; auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); } -// Note that, If call the memory allocation operation in TCMalloc new/delete Hook, -// such as calling LOG/iostream/sstream/stringstream/etc. 
related methods, -// must increase the control to avoid entering infinite recursion, otherwise it may cause crash or stuck, -void ThreadContext::consume() { +void ThreadMemTracker::consume() { // Query_mem_tracker and global_hook_tracker are counted separately, // in order to ensure that the process memory counted by global_hook_tracker is accurate enough. // @@ -111,15 +111,16 @@ void ThreadContext::consume() { } } -void ThreadContext::try_consume(int64_t size) { +void ThreadMemTracker::try_consume(int64_t size) { if (_stop_mem_tracker == true) { return; } _untracked_mem += size; - // When some threads `0 <_untracked_mem <_untracked_mem_limit` - // and some threads `_untracked_mem <= -_untracked_mem_limit` trigger consumption(), + // When some threads `0 < _untracked_mem < _tracker_consume_min_size` + // and some threads `_untracked_mem <= -_tracker_consume_min_size` trigger consumption(), // it will cause tracker->consumption to be temporarily less than 0. - if (_untracked_mem >= _untracked_mem_limit || _untracked_mem <= -_untracked_mem_limit) { + if (_untracked_mem >= _tracker_consume_min_size || + _untracked_mem <= -_tracker_consume_min_size) { consume(); _untracked_mem = 0; } diff --git a/be/src/runtime/thread_mem_tracker.h b/be/src/runtime/thread_mem_tracker.h new file mode 100644 index 00000000000000..70d78a79a74702 --- /dev/null +++ b/be/src/runtime/thread_mem_tracker.h @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "runtime/exec_env.h" +#include "runtime/fragment_mgr.h" +#include "runtime/mem_tracker.h" + +namespace doris { + +// TCMalloc new/delete Hook is counted in the memory_tracker of the current thread +class ThreadMemTracker { +public: + ThreadMemTracker() : _global_hook_tracker(MemTracker::GetGlobalHookTracker()) {} + ~ThreadMemTracker() { detach_query(); } + + // After attach, the current thread TCMalloc Hook starts to consume/release query mem_tracker + void attach_query(const std::string& query_id, const TUniqueId& fragment_instance_id); + + void detach_query(); + + void update_query_mem_tracker(std::weak_ptr mem_tracker); + + void query_mem_limit_exceeded(int64_t mem_usage); + + void global_mem_limit_exceeded(int64_t mem_usage); + + // Note that, If call the memory allocation operation in TCMalloc new/delete Hook, + // such as calling LOG/iostream/sstream/stringstream/etc. related methods, + // must increase the control to avoid entering infinite recursion, otherwise it may cause crash or stuck, + void consume(); + + void try_consume(int64_t size); + + void stop_mem_tracker() { _stop_mem_tracker = true; } + +private: + TUniqueId _fragment_instance_id; + + std::weak_ptr _query_mem_tracker; + std::shared_ptr _global_hook_tracker = nullptr; + + // Consume size smaller than _tracker_consume_min_size will continue to accumulate + // to avoid frequent calls to consume/release of MemTracker. 
+ int64_t _untracked_mem = 0; + int64_t _tracker_consume_min_size = config::mem_tracker_consume_min_size_mbytes; + + // Memory size of tracker failure after mem limit exceeded, + // expect to be successfully consumed later. + int64_t _missed_query_tracker_mem = 0; + int64_t _missed_global_tracker_mem = 0; + + // When memory is being consumed, avoid entering infinite recursion. + bool _query_mem_consuming = false; + bool _global_mem_consuming = false; + + // In some cases, we want to turn off memory statistics. + // For example, when ~GlobalHookTracker, TCMalloc delete hook + // release GlobalHookTracker will crash. + bool _stop_mem_tracker = false; +}; + +} // namespace doris diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 538ed686b4bf27..ccfe38200f149a 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -76,7 +76,6 @@ static void thrift_output(const char* x) { } // namespace doris int main(int argc, char** argv) { - init_hook(); // check if print version or help if (argc > 1) { if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { @@ -134,6 +133,10 @@ int main(int argc, char** argv) { return -1; } + if (doris::config::tc_init_hook) { + init_hook(); + } + #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) // Aggressive decommit is required so that unused pages in the TCMalloc page heap are // not backed by physical pages and do not contribute towards memory consumption. 
diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 7d4f190c4bd828..0e8ecf0206da92 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1303,6 +1303,12 @@ Available memory, value range: [0-100] If the system is found to be in a high-stress scenario and a large number of threads are found in the tcmalloc lock competition phase through the BE thread stack, such as a large number of `SpinLock` related stacks, you can try increasing this parameter to improve system performance. [Reference](https://github.com/gperftools/gperftools/issues/1111) +### `tc_init_hook` + +* Type: bool +* Description: Whether to initialize TCmalloc new/delete Hook, MemTracker is currently counted in Hook. +* Default: true + ### `tc_use_memory_min` Default:10737418240 @@ -1444,7 +1450,7 @@ The size of the buffer before flashing ### `mem_tracker_consume_min_size_mbytes` * Type: int32 -* Description: The minimum length of TCMalloc Hook when consume/release MemTracker. Consume size smaller than this value will continue to accumulate to avoid frequent calls to consume/release of MemTracker. Increasing this value will increase the frequency of consume/release. +* Description: The minimum length of TCMalloc Hook when consume/release MemTracker. Consume size smaller than this value will continue to accumulate to avoid frequent calls to consume/release of MemTracker. Decreasing this value will increase the frequency of consume/release. Increasing this value will cause MemTracker statistics to be inaccurate. Theoretically, the statistical value of a MemTracker differs from the true value = (mem_tracker_consume_min_size_mbytes * the number of BE threads where the MemTracker is located). 
* Default: 4M ### `max_segment_num_per_rowset` diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index 017e48ee5977d1..c1966eb96d4341 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1325,6 +1325,12 @@ tablet状态缓存的更新间隔,单位:秒 如果发现系统在高压力场景下,通过 BE 线程堆栈发现大量线程处于 tcmalloc 的锁竞争阶段,如大量的 `SpinLock` 相关堆栈,则可以尝试增大该参数来提升系统性能。[参考](https://github.com/gperftools/gperftools/issues/1111) +### `tc_init_hook` + +* 类型:bool +* 描述:是否初始化TCmalloc new/delete Hook,目前在Hook中统计MemTracker。 +* 默认值:true + ### `tc_use_memory_min` 默认值:10737418240 @@ -1463,7 +1469,7 @@ webserver默认工作线程数 ### `mem_tracker_consume_min_size_mbytes` * 类型: int32 -* 描述: TCMalloc Hook consume/release MemTracker时的最小长度,小于该值的consume size会持续累加,避免频繁调用MemTracker的consume/release,增大该值会增加consume/release的频率。 +* 描述: TCMalloc Hook consume/release MemTracker时的最小长度,小于该值的consume size会持续累加,避免频繁调用MemTracker的consume/release,减小该值会增加consume/release的频率,增大该值会导致MemTracker统计不准,理论上一个MemTracker的统计值与真实值相差 = (mem_tracker_consume_min_size_mbytes * 这个MemTracker所在的BE线程数)。 * 默认值: 4M ### `max_segment_num_per_rowset` From 5f3b25ea8f6ee06773159a63671855de07f7a4d6 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Thu, 9 Dec 2021 13:25:05 +0800 Subject: [PATCH 10/14] modify thread mem tracker --- be/src/common/config.h | 2 +- be/src/runtime/mem_tracker.cpp | 2 +- be/src/runtime/mem_tracker.h | 6 +- be/src/runtime/thread_context.h | 2 +- be/src/runtime/thread_mem_tracker.cpp | 63 +++++++++---------- be/src/runtime/thread_mem_tracker.h | 30 ++++----- .../administrator-guide/config/be_config.md | 6 +- .../administrator-guide/config/be_config.md | 6 +- 8 files changed, 59 insertions(+), 58 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index c7bb9627791593..b4974221ea9b6f 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -609,7 +609,7 @@ CONF_Int16(mem_tracker_level, "0"); // smaller 
than this value will continue to accumulate. specified as number of bytes. // Decreasing this value will increase the frequency of consume/release. // Increasing this value will cause MemTracker statistics to be inaccurate. -CONF_mInt32(mem_tracker_consume_min_size_mbytes, "1048576"); +CONF_mInt32(mem_tracker_consume_min_size_bytes, "1048576"); // The version information of the tablet will be stored in the memory // in an adjacency graph data structure. diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index e5b36b04549a7c..367ecb387c7a05 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -83,7 +83,7 @@ void MemTracker::CreateRootTracker() { } // An independent tracker, no parent and child, -// used in tcmalloc new/delete hook to count the real memory of the process +// used in tcmalloc new/delete hook to count the real memory of the process. static std::shared_ptr global_hook_mem_tracker; static GoogleOnceType global_hook_mem_tracker_once = GOOGLE_ONCE_INIT; diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index b33461b49fdd50..4a2e6bdd849ca4 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -158,7 +158,7 @@ class MemTracker : public std::enable_shared_from_this { if (LIKELY(tracker->consumption_metric_ == nullptr)) { DCHECK_GE(tracker->consumption_->current_value(), std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, - -config::mem_tracker_consume_min_size_mbytes * 10)); + -config::mem_tracker_consume_min_size_bytes * 10)); } } } @@ -268,11 +268,11 @@ class MemTracker : public std::enable_shared_from_this { /// consistent.) if (LIKELY(tracker->consumption_metric_ == nullptr)) { // A query corresponds to multiple threads, and each thread may have - // config::mem_tracker_consume_min_size_mbytes. The length is not cosumeed. Here, + // config::mem_tracker_consume_min_size_bytes. The length is not cosumeed. Here, // 10 is just a guess. 
DCHECK_GE(tracker->consumption_->current_value(), std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, - -config::mem_tracker_consume_min_size_mbytes * 10)) + -config::mem_tracker_consume_min_size_bytes * 10)) << std::endl << tracker->LogUsage(UNLIMITED_DEPTH); } diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 47d9bf7cdd1bc7..85a9e0cc05b8ec 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -53,8 +53,8 @@ class ThreadContext { const TUniqueId& fragment_instance_id = TUniqueId()) { _type = type; _task_id = task_id; - _fragment_instance_id = fragment_instance_id; if (type == TaskType::QUERY) { + _fragment_instance_id = fragment_instance_id; _thread_mem_tracker->attach_query(task_id, fragment_instance_id); } } diff --git a/be/src/runtime/thread_mem_tracker.cpp b/be/src/runtime/thread_mem_tracker.cpp index 7059925b5bda18..220a504c087a76 100644 --- a/be/src/runtime/thread_mem_tracker.cpp +++ b/be/src/runtime/thread_mem_tracker.cpp @@ -30,11 +30,13 @@ void ThreadMemTracker::attach_query(const std::string& query_id, #endif update_query_mem_tracker( ExecEnv::GetInstance()->query_mem_tracker_registry()->get_query_mem_tracker(query_id)); + _query_id = query_id; _fragment_instance_id = fragment_instance_id; } void ThreadMemTracker::detach_query() { update_query_mem_tracker(std::weak_ptr()); + _query_id = ""; _fragment_instance_id = TUniqueId(); } @@ -53,9 +55,11 @@ void ThreadMemTracker::query_mem_limit_exceeded(int64_t mem_usage) { auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); detail += - " Query Memory exceed limit in TCMalloc Hook New, Backend: {}, Fragment: {}, Used: " - "{}, Limit: {}. You can change the limit by session variable exec_mem_limit."; - fmt::format(detail, BackendOptions::get_localhost(), print_id(_fragment_instance_id), + " Query Memory exceed limit in TCMalloc Hook New, Backend: {}, Query: {}, " + "Fragment: {}, Used: {}, Limit: {}. 
You can change the limit by session variable " + "exec_mem_limit."; + fmt::format(detail, BackendOptions::get_localhost(), _query_id, + print_id(_fragment_instance_id), std::to_string(_query_mem_tracker.lock()->consumption()), std::to_string(_query_mem_tracker.lock()->limit())); ExecEnv::GetInstance()->fragment_mgr()->cancel( @@ -65,8 +69,11 @@ void ThreadMemTracker::query_mem_limit_exceeded(int64_t mem_usage) { } void ThreadMemTracker::global_mem_limit_exceeded(int64_t mem_usage) { - std::string detail = "Global Memory exceed limit in TCMalloc Hook New."; - auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); + if (time(nullptr) - global_exceeded_interval > 60) { + std::string detail = "Global Memory exceed limit in TCMalloc Hook New."; + auto st = _global_hook_tracker->MemLimitExceeded(nullptr, detail, mem_usage); + global_exceeded_interval = time(nullptr); + } } void ThreadMemTracker::consume() { @@ -77,37 +84,29 @@ void ThreadMemTracker::consume() { // is the default tracker, it may be the same block of memory. Consume is called in query_mem_tracker, // and release is called in global_hook_tracker, which is repeatedly released after ~query_mem_tracker. 
if (!_query_mem_tracker.expired()) { - if (_query_mem_consuming == false) { - _query_mem_consuming = true; - if (!_query_mem_tracker.lock()->TryConsume(_missed_query_tracker_mem + - _untracked_mem)) { - query_mem_limit_exceeded(_missed_query_tracker_mem + _untracked_mem); - _missed_query_tracker_mem += _untracked_mem; - } else { - _missed_query_tracker_mem = 0; + if (_stop_query_mem_tracker == false) { + _stop_query_mem_tracker = true; + if (!_query_mem_tracker.lock()->TryConsume(_untracked_mem)) { + query_mem_limit_exceeded(_untracked_mem); } - _query_mem_consuming = false; - } else { - _missed_query_tracker_mem += _untracked_mem; + _stop_query_mem_tracker = false; } } // The first time GetGlobalHookTracker is called after the main thread starts, == nullptr if (_global_hook_tracker != nullptr) { - if (_global_mem_consuming == false) { - _global_mem_consuming = true; - if (!_global_hook_tracker->TryConsume(_missed_global_tracker_mem + _untracked_mem)) { - global_mem_limit_exceeded(_missed_global_tracker_mem + _untracked_mem); - _missed_global_tracker_mem += _untracked_mem; - } else { - _missed_global_tracker_mem = 0; + if (_stop_global_mem_tracker == false) { + _stop_global_mem_tracker = true; + if (!_global_hook_tracker->TryConsume(_untracked_mem)) { + // Currently, _global_hook_tracker is only used for real-time observation to verify + // the accuracy of MemTracker statistics. Therefore, when the _global_hook_tracker + // TryConsume fails, the process is not expected to terminate. To ensure the accuracy + // of real-time statistics, continue to complete the Consume. 
+ _global_hook_tracker->Consume(_untracked_mem); + global_mem_limit_exceeded(_untracked_mem); } - _global_mem_consuming = false; - } else { - _missed_global_tracker_mem += _untracked_mem; + _stop_global_mem_tracker = false; } - } else { - _missed_global_tracker_mem += _untracked_mem; } } @@ -116,11 +115,11 @@ void ThreadMemTracker::try_consume(int64_t size) { return; } _untracked_mem += size; - // When some threads `0 < _untracked_mem < _tracker_consume_min_size` - // and some threads `_untracked_mem <= -_tracker_consume_min_size` trigger consumption(), + // When some threads `0 < _untracked_mem < _tracker_consume_cache_size` + // and some threads `_untracked_mem <= -_tracker_consume_cache_size` trigger consumption(), // it will cause tracker->consumption to be temporarily less than 0. - if (_untracked_mem >= _tracker_consume_min_size || - _untracked_mem <= -_tracker_consume_min_size) { + if (_untracked_mem >= _tracker_consume_cache_size || + _untracked_mem <= -_tracker_consume_cache_size) { consume(); _untracked_mem = 0; } diff --git a/be/src/runtime/thread_mem_tracker.h b/be/src/runtime/thread_mem_tracker.h index 70d78a79a74702..521aa56d65fd8d 100644 --- a/be/src/runtime/thread_mem_tracker.h +++ b/be/src/runtime/thread_mem_tracker.h @@ -38,6 +38,11 @@ class ThreadMemTracker { void update_query_mem_tracker(std::weak_ptr mem_tracker); + void try_consume(int64_t size); + + void stop_mem_tracker() { _stop_mem_tracker = true; } + +private: void query_mem_limit_exceeded(int64_t mem_usage); void global_mem_limit_exceeded(int64_t mem_usage); @@ -47,34 +52,31 @@ class ThreadMemTracker { // must increase the control to avoid entering infinite recursion, otherwise it may cause crash or stuck, void consume(); - void try_consume(int64_t size); - - void stop_mem_tracker() { _stop_mem_tracker = true; } - private: + std::string _query_id; TUniqueId _fragment_instance_id; std::weak_ptr _query_mem_tracker; std::shared_ptr _global_hook_tracker = nullptr; - // Consume size smaller 
than _tracker_consume_min_size will continue to accumulate + // Consume size smaller than _tracker_consume_cache_size will continue to accumulate // to avoid frequent calls to consume/release of MemTracker. int64_t _untracked_mem = 0; - int64_t _tracker_consume_min_size = config::mem_tracker_consume_min_size_mbytes; + int64_t _tracker_consume_cache_size = config::mem_tracker_consume_min_size_bytes; - // Memory size of tracker failure after mem limit exceeded, - // expect to be successfully consumed later. - int64_t _missed_query_tracker_mem = 0; - int64_t _missed_global_tracker_mem = 0; - - // When memory is being consumed, avoid entering infinite recursion. - bool _query_mem_consuming = false; - bool _global_mem_consuming = false; + // If there is a memory new/delete operation in the consume method, it may enter infinite recursion. + // Note: After the tracker is stopped, the memory alloc in the consume method should be released in time, + // otherwise the MemTracker statistics will be inaccurate. + bool _stop_query_mem_tracker = false; + bool _stop_global_mem_tracker = false; // In some cases, we want to turn off memory statistics. // For example, when ~GlobalHookTracker, TCMalloc delete hook // release GlobalHookTracker will crash. bool _stop_mem_tracker = false; + + // Control the interval of printing Log. + int64_t global_exceeded_interval = 0; }; } // namespace doris diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 0e8ecf0206da92..120338468d4666 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1447,11 +1447,11 @@ The size of the buffer before flashing ``` * Default: 0 -### `mem_tracker_consume_min_size_mbytes` +### `mem_tracker_consume_min_size_bytes` * Type: int32 -* Description: The minimum length of TCMalloc Hook when consume/release MemTracker. 
Consume size smaller than this value will continue to accumulate to avoid frequent calls to consume/release of MemTracker. Decreasing this value will increase the frequency of consume/release. Increasing this value will cause MemTracker statistics to be inaccurate. Theoretically, the statistical value of a MemTracker differs from the true value = (mem_tracker_consume_min_size_mbytes * the number of BE threads where the MemTracker is located). -* Default: 4M +* Description: The minimum length of TCMalloc Hook when consume/release MemTracker. Consume size smaller than this value will continue to accumulate to avoid frequent calls to consume/release of MemTracker. Decreasing this value will increase the frequency of consume/release. Increasing this value will cause MemTracker statistics to be inaccurate. Theoretically, the statistical value of a MemTracker differs from the true value = (mem_tracker_consume_min_size_bytes * the number of BE threads where the MemTracker is located). +* Default: 1048576 ### `max_segment_num_per_rowset` diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index c1966eb96d4341..8bbfec30be4bcb 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1466,11 +1466,11 @@ webserver默认工作线程数 ``` * 默认值: 0 -### `mem_tracker_consume_min_size_mbytes` +### `mem_tracker_consume_min_size_bytes` * 类型: int32 -* 描述: TCMalloc Hook consume/release MemTracker时的最小长度,小于该值的consume size会持续累加,避免频繁调用MemTracker的consume/release,减小该值会增加consume/release的频率,增大该值会导致MemTracker统计不准,理论上一个MemTracker的统计值与真实值相差 = (mem_tracker_consume_min_size_mbytes * 这个MemTracker所在的BE线程数)。 -* 默认值: 4M +* 描述: TCMalloc Hook consume/release MemTracker时的最小长度,小于该值的consume size会持续累加,避免频繁调用MemTracker的consume/release,减小该值会增加consume/release的频率,增大该值会导致MemTracker统计不准,理论上一个MemTracker的统计值与真实值相差 = (mem_tracker_consume_min_size_bytes * 这个MemTracker所在的BE线程数)。 +* 默认值: 1048576 ### 
`max_segment_num_per_rowset` From fd199c74b409ab96a3d6f3fff4ff251d7eef8038 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Wed, 5 Jan 2022 20:37:06 +0800 Subject: [PATCH 11/14] Refactor MemTracker --- be/src/common/config.h | 16 +- be/src/common/daemon.cpp | 11 - be/src/exec/analytic_eval_node.cpp | 2 +- be/src/exec/base_scanner.cpp | 2 +- be/src/exec/blocking_join_node.cpp | 3 + be/src/exec/broker_scan_node.cpp | 2 +- be/src/exec/data_sink.cpp | 2 +- be/src/exec/es/es_scroll_parser.cpp | 4 +- be/src/exec/es_http_scanner.cpp | 2 +- be/src/exec/es_scan_node.cpp | 2 +- be/src/exec/exec_node.cpp | 10 +- be/src/exec/exec_node.h | 19 - be/src/exec/hash_join_node.cpp | 3 + be/src/exec/hash_table.cpp | 12 +- be/src/exec/olap_scan_node.cpp | 10 +- be/src/exec/olap_scanner.cpp | 5 +- be/src/exec/partitioned_aggregation_node.cc | 4 +- be/src/exec/partitioned_hash_table.cc | 6 +- be/src/exec/tablet_info.cpp | 2 +- be/src/exec/tablet_sink.cpp | 15 +- be/src/exec/tablet_sink.h | 2 +- be/src/exprs/agg_fn_evaluator.cpp | 4 +- be/src/exprs/anyval_util.cpp | 2 +- be/src/exprs/bloomfilter_predicate.h | 6 +- be/src/exprs/expr_context.cpp | 2 +- be/src/exprs/new_agg_fn_evaluator.cc | 2 +- be/src/gutil/strings/numbers.cc | 35 + be/src/gutil/strings/numbers.h | 4 + be/src/http/action/compaction_action.h | 9 +- be/src/http/default_path_handlers.cpp | 18 +- be/src/olap/aggregate_func.h | 4 +- be/src/olap/collect_iterator.h | 2 + be/src/olap/compaction.cpp | 22 +- be/src/olap/delta_writer.cpp | 2 +- be/src/olap/fs/file_block_manager.cpp | 4 +- be/src/olap/generic_iterators.cpp | 4 +- be/src/olap/lru_cache.cpp | 23 +- be/src/olap/lru_cache.h | 2 +- be/src/olap/memtable.cpp | 2 +- be/src/olap/memtable.h | 1 + be/src/olap/memtable_flush_executor.cpp | 13 +- be/src/olap/page_cache.cpp | 18 +- be/src/olap/push_handler.cpp | 2 +- be/src/olap/row_block.cpp | 2 +- be/src/olap/row_block2.cpp | 2 +- be/src/olap/rowset/segment_reader.cpp | 11 +- .../olap/rowset/segment_v2/column_reader.cpp | 2 +- 
be/src/olap/rowset/segment_v2/page_io.cpp | 21 +- be/src/olap/rowset/segment_v2/segment.cpp | 12 +- .../rowset/segment_v2/segment_iterator.cpp | 6 +- .../olap/rowset/segment_v2/segment_writer.cpp | 19 +- be/src/olap/schema_change.cpp | 15 +- be/src/olap/segment_loader.cpp | 3 +- be/src/olap/storage_engine.cpp | 10 +- be/src/olap/tablet_manager.cpp | 9 +- be/src/olap/task/engine_checksum_task.h | 1 + be/src/runtime/CMakeLists.txt | 3 +- be/src/runtime/buffered_block_mgr2.cc | 48 +- be/src/runtime/bufferpool/buffer_allocator.cc | 18 +- .../runtime/bufferpool/reservation_tracker.cc | 13 +- be/src/runtime/data_stream_recvr.cc | 3 +- be/src/runtime/data_stream_sender.cpp | 6 +- be/src/runtime/disk_io_mgr.cc | 31 +- be/src/runtime/disk_io_mgr.h | 6 +- be/src/runtime/exec_env.h | 19 +- be/src/runtime/exec_env_init.cpp | 19 +- be/src/runtime/export_sink.cpp | 2 +- be/src/runtime/fold_constant_executor.cpp | 2 +- be/src/runtime/fragment_mgr.cpp | 8 +- be/src/runtime/initial_reservations.cc | 2 +- be/src/runtime/load_channel.cpp | 4 +- be/src/runtime/load_channel_mgr.cpp | 2 +- be/src/runtime/mem_pool.cpp | 68 +- be/src/runtime/mem_pool.h | 14 +- be/src/runtime/mem_tracker.cpp | 603 ++++------------- be/src/runtime/mem_tracker.h | 626 +++++------------- be/src/runtime/mem_tracker_task_pool.cpp | 111 ++++ be/src/runtime/mem_tracker_task_pool.h | 55 ++ be/src/runtime/memory/chunk.h | 3 + be/src/runtime/memory/chunk_allocator.cpp | 43 +- be/src/runtime/memory/chunk_allocator.h | 7 +- be/src/runtime/mysql_table_sink.cpp | 2 +- be/src/runtime/odbc_table_sink.cpp | 2 +- be/src/runtime/plan_fragment_executor.cpp | 14 +- be/src/runtime/result_file_sink.cpp | 6 +- be/src/runtime/result_sink.cpp | 5 + be/src/runtime/row_batch.cpp | 138 +++- be/src/runtime/runtime_filter_mgr.cpp | 2 +- be/src/runtime/runtime_state.cpp | 35 +- be/src/runtime/runtime_state.h | 2 +- be/src/runtime/tablets_channel.cpp | 2 +- be/src/runtime/tcmalloc_hook.h | 5 +- be/src/runtime/thread_context.h | 84 ++- 
be/src/runtime/thread_mem_tracker.cpp | 128 ---- be/src/runtime/thread_mem_tracker_mgr.cpp | 131 ++++ ...mem_tracker.h => thread_mem_tracker_mgr.h} | 74 ++- be/src/runtime/vectorized_row_batch.cpp | 2 +- be/src/service/doris_main.cpp | 4 +- be/test/exec/hash_table_test.cpp | 12 +- be/test/exec/tablet_sink_test.cpp | 2 +- be/test/exprs/bloom_filter_predicate_test.cpp | 6 +- .../bloom_filter_column_predicate_test.cpp | 2 +- be/test/olap/generic_iterators_test.cpp | 6 +- be/test/runtime/mem_limit_test.cpp | 130 ++-- be/test/runtime/memory_scratch_sink_test.cpp | 2 +- be/test/runtime/test_env.cc | 8 +- be/test/util/arrow/arrow_work_flow_test.cpp | 4 +- .../administrator-guide/config/be_config.md | 26 +- .../administrator-guide/config/be_config.md | 24 +- 109 files changed, 1476 insertions(+), 1526 deletions(-) create mode 100644 be/src/runtime/mem_tracker_task_pool.cpp create mode 100644 be/src/runtime/mem_tracker_task_pool.h delete mode 100644 be/src/runtime/thread_mem_tracker.cpp create mode 100644 be/src/runtime/thread_mem_tracker_mgr.cpp rename be/src/runtime/{thread_mem_tracker.h => thread_mem_tracker_mgr.h} (54%) diff --git a/be/src/common/config.h b/be/src/common/config.h index b4974221ea9b6f..563785ec900a13 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -60,9 +60,6 @@ CONF_mInt64(tc_free_memory_rate, "20"); // https://github.com/gperftools/gperftools/issues/1111 CONF_Int64(tc_max_total_thread_cache_bytes, "1073741824"); -// Whether to initialize TCmalloc new/delete Hook, MemTracker is currently counted in Hook. -CONF_mBool(tc_init_hook, "true"); - // process memory limit specified as number of bytes // ('[bB]?'), megabytes ('[mM]'), gigabytes ('[gG]'), // or percentage of the physical memory ('%'). 
@@ -598,12 +595,17 @@ CONF_Int32(aws_log_level, "3"); // the buffer size when read data from remote storage like s3 CONF_mInt32(remote_storage_read_buffer_mb, "16"); +// Whether to initialize TCmalloc new/delete Hook, MemTracker is currently counted in Hook. +CONF_mBool(use_tc_hook, "true"); + // Default level of MemTracker to show in web page // now MemTracker support two level: -// RELEASE: 0 -// DEBUG: 1 +// OVERVIEW: 0 +// TASK: 1 +// INSTANCE: 2 +// VERBOSE: 3 // the level equal or lower than mem_tracker_level will show in web page -CONF_Int16(mem_tracker_level, "0"); +CONF_mInt16(mem_tracker_level, "0"); // The minimum length when TCMalloc Hook consumes/releases MemTracker, consume size // smaller than this value will continue to accumulate. specified as number of bytes. @@ -611,6 +613,8 @@ CONF_Int16(mem_tracker_level, "0"); // Increasing this value will cause MemTracker statistics to be inaccurate. CONF_mInt32(mem_tracker_consume_min_size_bytes, "1048576"); +CONF_mBool(memory_leak_detection, "false"); + // The version information of the tablet will be stored in the memory // in an adjacency graph data structure. // And as the new version is written and the old version is deleted, diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 044feda7cf6158..36e4f84e36c03f 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -95,17 +95,6 @@ void Daemon::memory_maintenance_thread() { if (env != nullptr) { BufferPool* buffer_pool = env->buffer_pool(); if (buffer_pool != nullptr) buffer_pool->Maintenance(); - - // The process limit as measured by our trackers may get out of sync with the - // process usage if memory is allocated or freed without updating a MemTracker. - // The metric is refreshed whenever memory is consumed or released via a MemTracker, - // so on a system with queries executing it will be refreshed frequently. 
However - // if the system is idle, we need to refresh the tracker occasionally since - // untracked memory may be allocated or freed, e.g. by background threads. - if (env->process_mem_tracker() != nullptr && - !env->process_mem_tracker()->is_consumption_metric_null()) { - env->process_mem_tracker()->RefreshConsumptionFromMetric(); - } } } } diff --git a/be/src/exec/analytic_eval_node.cpp b/be/src/exec/analytic_eval_node.cpp index df1b4cea275537..2a9afa38687c04 100644 --- a/be/src/exec/analytic_eval_node.cpp +++ b/be/src/exec/analytic_eval_node.cpp @@ -201,7 +201,7 @@ Status AnalyticEvalNode::open(RuntimeState* state) { "Failed to acquire initial read buffer for analytic function " "evaluation. Reducing query concurrency or increasing the memory limit may " "help this query to complete successfully."); - return mem_tracker()->MemLimitExceeded(state, msg, -1); + return mem_tracker()->mem_limit_exceeded(state, msg, -1); } DCHECK_EQ(_evaluators.size(), _fn_ctxs.size()); diff --git a/be/src/exec/base_scanner.cpp b/be/src/exec/base_scanner.cpp index f9fb5e389860e5..eb29bafa4ba066 100644 --- a/be/src/exec/base_scanner.cpp +++ b/be/src/exec/base_scanner.cpp @@ -44,7 +44,7 @@ BaseScanner::BaseScanner(RuntimeState* state, RuntimeProfile* profile, _mem_tracker(new MemTracker()), #else _mem_tracker( - MemTracker::CreateTracker(-1, "BaseScanner:" + std::to_string(state->load_job_id()), + MemTracker::create_tracker(-1, "BaseScanner:" + std::to_string(state->load_job_id()), state->instance_mem_tracker())), #endif _mem_pool(_mem_tracker.get()), diff --git a/be/src/exec/blocking_join_node.cpp b/be/src/exec/blocking_join_node.cpp index ba137860ac7742..d46e54dc0eefca 100644 --- a/be/src/exec/blocking_join_node.cpp +++ b/be/src/exec/blocking_join_node.cpp @@ -23,6 +23,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -82,6 +83,8 @@ 
Status BlockingJoinNode::close(RuntimeState* state) { } void BlockingJoinNode::build_side_thread(RuntimeState* state, std::promise* status) { + SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(state->query_id()), + state->fragment_instance_id()); status->set_value(construct_build_side(state)); } diff --git a/be/src/exec/broker_scan_node.cpp b/be/src/exec/broker_scan_node.cpp index 11928480d85f3d..344ca3f95e2515 100644 --- a/be/src/exec/broker_scan_node.cpp +++ b/be/src/exec/broker_scan_node.cpp @@ -318,7 +318,7 @@ Status BrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, // 1. too many batches in queue, or // 2. at least one batch in queue and memory exceed limit. (_batch_queue.size() >= _max_buffered_batches || - (mem_tracker()->AnyLimitExceeded(MemLimit::HARD) && !_batch_queue.empty()))) { + (mem_tracker()->any_limit_exceeded() && !_batch_queue.empty()))) { _queue_writer_cond.wait_for(l, std::chrono::seconds(1)); } // Process already set failed, so we just return OK diff --git a/be/src/exec/data_sink.cpp b/be/src/exec/data_sink.cpp index 2a259482251a45..994d98a35d233c 100644 --- a/be/src/exec/data_sink.cpp +++ b/be/src/exec/data_sink.cpp @@ -181,7 +181,7 @@ Status DataSink::init(const TDataSink& thrift_sink) { Status DataSink::prepare(RuntimeState* state) { _expr_mem_tracker = - MemTracker::CreateTracker(-1, _name + ":Expr:" + std::to_string(state->load_job_id()), + MemTracker::create_tracker(-1, _name + ":Expr:" + std::to_string(state->load_job_id()), state->instance_mem_tracker()); return Status::OK(); } diff --git a/be/src/exec/es/es_scroll_parser.cpp b/be/src/exec/es/es_scroll_parser.cpp index 86cd16a934af7e..06e693a5faea80 100644 --- a/be/src/exec/es/es_scroll_parser.cpp +++ b/be/src/exec/es/es_scroll_parser.cpp @@ -359,7 +359,7 @@ Status ScrollParser::fill_tuple(const TupleDescriptor* tuple_desc, Tuple* tuple, if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", 
len, "string slot"); - return tuple_pool->mem_tracker()->MemLimitExceeded(nullptr, details, len); + return tuple_pool->mem_tracker()->mem_limit_exceeded(nullptr, details, len); } memcpy(buffer, _id.data(), len); reinterpret_cast(slot)->ptr = buffer; @@ -417,7 +417,7 @@ Status ScrollParser::fill_tuple(const TupleDescriptor* tuple_desc, Tuple* tuple, if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute( ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", val_size, "string slot"); - return tuple_pool->mem_tracker()->MemLimitExceeded(nullptr, details, val_size); + return tuple_pool->mem_tracker()->mem_limit_exceeded(nullptr, details, val_size); } memcpy(buffer, val.data(), val_size); reinterpret_cast(slot)->ptr = buffer; diff --git a/be/src/exec/es_http_scanner.cpp b/be/src/exec/es_http_scanner.cpp index fe3d67b80daf2d..9a914b90c2361a 100644 --- a/be/src/exec/es_http_scanner.cpp +++ b/be/src/exec/es_http_scanner.cpp @@ -47,7 +47,7 @@ EsHttpScanner::EsHttpScanner(RuntimeState* state, RuntimeProfile* profile, Tuple _mem_tracker(new MemTracker()), #else _mem_tracker( - MemTracker::CreateTracker(-1, "EsHttpScanner:" + std::to_string(state->load_job_id()), + MemTracker::create_tracker(-1, "EsHttpScanner:" + std::to_string(state->load_job_id()), state->instance_mem_tracker())), #endif _mem_pool(_mem_tracker.get()), diff --git a/be/src/exec/es_scan_node.cpp b/be/src/exec/es_scan_node.cpp index fad266993beb74..c71a3efe1d1409 100644 --- a/be/src/exec/es_scan_node.cpp +++ b/be/src/exec/es_scan_node.cpp @@ -775,7 +775,7 @@ Status EsScanNode::materialize_row(MemPool* tuple_pool, Tuple* tuple, if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute( ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", val_size, "string slot"); - return tuple_pool->mem_tracker()->MemLimitExceeded(nullptr, details, val_size); + return tuple_pool->mem_tracker()->mem_limit_exceeded(nullptr, details, val_size); } memcpy(buffer, val.data(), val_size); 
reinterpret_cast(slot)->ptr = buffer; diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index 4501d423430bb5..14a24304182ac7 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -203,11 +203,11 @@ Status ExecNode::prepare(RuntimeState* state) { std::bind(&RuntimeProfile::units_per_second, _rows_returned_counter, runtime_profile()->total_time_counter()), ""); - _mem_tracker = MemTracker::CreateTracker(_runtime_profile.get(), -1, - "ExecNode:" + _runtime_profile->name(), - state->instance_mem_tracker()); - _expr_mem_tracker = MemTracker::CreateTracker(-1, "ExecNode:Exprs:" + _runtime_profile->name(), - _mem_tracker); + _mem_tracker = MemTracker::create_tracker(-1, "ExecNode:" + _runtime_profile->name(), + state->instance_mem_tracker(), + MemTrackerLevel::VERBOSE, _runtime_profile.get()); + _expr_mem_tracker = MemTracker::create_tracker(-1, "ExecNode:Exprs:" + _runtime_profile->name(), + _mem_tracker); _expr_mem_pool.reset(new MemPool(_expr_mem_tracker.get())); if (_vconjunct_ctx_ptr) { diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h index 7cad50018d848a..c27469d2561fdb 100644 --- a/be/src/exec/exec_node.h +++ b/be/src/exec/exec_node.h @@ -377,25 +377,6 @@ class ExecNode { bool _is_closed; }; -#define LIMIT_EXCEEDED(tracker, state, msg) \ - do { \ - stringstream str; \ - str << "Memory exceed limit. " << msg << " "; \ - str << "Backend: " << BackendOptions::get_localhost() << ", "; \ - str << "fragment: " << print_id(state->fragment_instance_id()) << " "; \ - str << "Used: " << tracker->consumption() << ", Limit: " << tracker->limit() << ". 
"; \ - str << "You can change the limit by session variable exec_mem_limit."; \ - return Status::MemoryLimitExceeded(str.str()); \ - } while (false) - -#define RETURN_IF_LIMIT_EXCEEDED(state, msg) \ - do { \ - /* if (UNLIKELY(MemTracker::limit_exceeded(*(state)->mem_trackers()))) { */ \ - MemTracker* tracker = state->instance_mem_tracker()->find_limit_exceeded_tracker(); \ - if (tracker != nullptr) { \ - LIMIT_EXCEEDED(tracker, state, msg); \ - } \ - } while (false) } // namespace doris #endif diff --git a/be/src/exec/hash_join_node.cpp b/be/src/exec/hash_join_node.cpp index f7e8bbcc649858..5f39ff0e5102e9 100644 --- a/be/src/exec/hash_join_node.cpp +++ b/be/src/exec/hash_join_node.cpp @@ -30,6 +30,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_filter_mgr.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/defer_op.h" #include "util/runtime_profile.h" @@ -176,6 +177,8 @@ Status HashJoinNode::close(RuntimeState* state) { } void HashJoinNode::build_side_thread(RuntimeState* state, std::promise* status) { + SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(state->query_id()), + state->fragment_instance_id()); status->set_value(construct_hash_table(state)); } diff --git a/be/src/exec/hash_table.cpp b/be/src/exec/hash_table.cpp index 36c3d7b76eaa9c..50f9c8c87784db 100644 --- a/be/src/exec/hash_table.cpp +++ b/be/src/exec/hash_table.cpp @@ -53,7 +53,7 @@ HashTable::HashTable(const std::vector& build_expr_ctxs, _buckets.resize(num_buckets); _num_buckets = num_buckets; _num_buckets_till_resize = MAX_BUCKET_OCCUPANCY_FRACTION * _num_buckets; - _mem_tracker->Consume(_buckets.capacity() * sizeof(Bucket)); + _mem_tracker->consume(_buckets.capacity() * sizeof(Bucket)); // Compute the layout and buffer size to store the evaluated expr results _results_buffer_size = Expr::compute_results_layout( @@ -70,7 +70,7 @@ HashTable::HashTable(const std::vector& build_expr_ctxs, _alloc_list.push_back(_current_nodes); 
_end_list.push_back(_current_nodes + _current_capacity * _node_byte_size); - _mem_tracker->Consume(_current_capacity * _node_byte_size); + _mem_tracker->consume(_current_capacity * _node_byte_size); if (_mem_tracker->limit_exceeded()) { mem_limit_exceeded(_current_capacity * _node_byte_size); } @@ -85,8 +85,8 @@ void HashTable::close() { for (auto ptr : _alloc_list) { free(ptr); } - _mem_tracker->Release(_total_capacity * _node_byte_size); - _mem_tracker->Release(_buckets.size() * sizeof(Bucket)); + _mem_tracker->release(_total_capacity * _node_byte_size); + _mem_tracker->release(_buckets.size() * sizeof(Bucket)); } bool HashTable::eval_row(TupleRow* row, const std::vector& ctxs) { @@ -180,7 +180,7 @@ Status HashTable::resize_buckets(int64_t num_buckets) { int64_t old_num_buckets = _num_buckets; int64_t delta_bytes = (num_buckets - old_num_buckets) * sizeof(Bucket); - Status st = _mem_tracker->TryConsume(delta_bytes); + Status st = _mem_tracker->try_consume(delta_bytes); if (!st) { LOG_EVERY_N(WARNING, 100) << "resize bucket failed: " << st.to_string(); mem_limit_exceeded(delta_bytes); @@ -244,7 +244,7 @@ void HashTable::grow_node_array() { _alloc_list.push_back(_current_nodes); _end_list.push_back(_current_nodes + alloc_size); - _mem_tracker->Consume(alloc_size); + _mem_tracker->consume(alloc_size); if (_mem_tracker->limit_exceeded()) { mem_limit_exceeded(alloc_size); } diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index 1e788d771be6d0..e53b54b1a47578 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -1332,6 +1332,8 @@ Status OlapScanNode::normalize_bloom_filter_predicate(SlotDescriptor* slot) { void OlapScanNode::transfer_thread(RuntimeState* state) { // scanner open pushdown to scanThread + SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(state->query_id()), + state->fragment_instance_id()); Status status = Status::OK(); for (auto scanner : _olap_scanners) { status = 
Expr::clone_if_not_exists(_conjunct_ctxs, state, scanner->conjunct_ctxs()); @@ -1502,8 +1504,10 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { } void OlapScanNode::scanner_thread(OlapScanner* scanner) { - thread_local_ctx.attach(ThreadContext::QUERY, print_id(scanner->runtime_state()->query_id()), + SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(scanner->runtime_state()->query_id()), _runtime_state->fragment_instance_id()); + // thread_local_ctx.attach(ThreadContext::QUERY, print_id(scanner->runtime_state()->query_id()), + // _runtime_state->fragment_instance_id()); if (UNLIKELY(_transfer_done)) { _scanner_done = true; std::unique_lock l(_scan_batches_lock); @@ -1513,7 +1517,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); LOG(INFO) << "Scan thread cancelled, cause query done, scan thread started to exit"; - thread_local_ctx.detach(); + // thread_local_ctx.detach(); return; } int64_t wait_time = scanner->update_wait_worker_timer(); @@ -1663,7 +1667,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { // and transfer thread _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); - thread_local_ctx.detach(); + // thread_local_ctx.detach(); } Status OlapScanNode::add_one_batch(RowBatch* row_batch) { diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index d7dc83967b5abb..daa536493c9027 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -48,10 +48,9 @@ OlapScanner::OlapScanner(RuntimeState* runtime_state, OlapScanNode* parent, bool _aggregation(aggregation), _need_agg_finalize(need_agg_finalize), _version(-1), - _mem_tracker(MemTracker::CreateTracker( + _mem_tracker(MemTracker::create_tracker( runtime_state->fragment_mem_tracker()->limit(), "OlapScanner", - runtime_state->fragment_mem_tracker(), true, true, MemTrackerLevel::VERBOSE)) { -} + runtime_state->fragment_mem_tracker(), 
MemTrackerLevel::VERBOSE)) {} Status OlapScanner::prepare( const TPaloScanRange& scan_range, const std::vector& key_ranges, diff --git a/be/src/exec/partitioned_aggregation_node.cc b/be/src/exec/partitioned_aggregation_node.cc index 98651998b6123b..35b502fe7ce305 100644 --- a/be/src/exec/partitioned_aggregation_node.cc +++ b/be/src/exec/partitioned_aggregation_node.cc @@ -409,7 +409,7 @@ Status PartitionedAggregationNode::CopyStringData(const SlotDescriptor& slot_des "Cannot perform aggregation at node with id $0." " Failed to allocate $1 output bytes.", _id, sv->len); - return pool->mem_tracker()->MemLimitExceeded(state_, details, sv->len); + return pool->mem_tracker()->mem_limit_exceeded(state_, details, sv->len); } memcpy(new_ptr, sv->ptr, sv->len); sv->ptr = new_ptr; @@ -932,7 +932,7 @@ Tuple* PartitionedAggregationNode::ConstructIntermediateTuple( << ", Limit: " << pool->mem_tracker()->limit() << ". " << "You can change the limit by session variable exec_mem_limit."; string details = Substitute(str.str(), _id, tuple_data_size); - *status = pool->mem_tracker()->MemLimitExceeded(state_, details, tuple_data_size); + *status = pool->mem_tracker()->mem_limit_exceeded(state_, details, tuple_data_size); return nullptr; } memset(tuple_data, 0, fixed_size); diff --git a/be/src/exec/partitioned_hash_table.cc b/be/src/exec/partitioned_hash_table.cc index b8cbdaab631b3c..cc52c6067bb17c 100644 --- a/be/src/exec/partitioned_hash_table.cc +++ b/be/src/exec/partitioned_hash_table.cc @@ -310,13 +310,13 @@ Status PartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, MAX_EXPR_VALUES_ARRAY_SIZE / expr_values_bytes_per_row_)); int mem_usage = MemUsage(capacity_, expr_values_bytes_per_row_, num_exprs_); - Status st = tracker->TryConsume(mem_usage); + Status st = tracker->try_consume(mem_usage); WARN_IF_ERROR(st, "PartitionedHashTableCtx::ExprValuesCache failed"); if (UNLIKELY(!st)) { capacity_ = 0; string details = Substitute( "PartitionedHashTableCtx::ExprValuesCache 
failed to allocate $0 bytes.", mem_usage); - return tracker->MemLimitExceeded(state, details, mem_usage); + return tracker->mem_limit_exceeded(state, details, mem_usage); } int expr_values_size = expr_values_bytes_per_row_ * capacity_; @@ -349,7 +349,7 @@ void PartitionedHashTableCtx::ExprValuesCache::Close(const std::shared_ptrRelease(mem_usage); + tracker->release(mem_usage); } int PartitionedHashTableCtx::ExprValuesCache::MemUsage(int capacity, int expr_values_bytes_per_row, diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index cbeda55bd1394d..491fa50dad5b02 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -161,7 +161,7 @@ OlapTablePartitionParam::OlapTablePartitionParam(std::shared_ptr_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && + while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && _pending_batches_num > 0) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); @@ -704,8 +705,8 @@ Status OlapTableSink::prepare(RuntimeState* state) { // profile must add to state's object pool _profile = state->obj_pool()->add(new RuntimeProfile("OlapTableSink")); _mem_tracker = - MemTracker::CreateTracker(-1, "OlapTableSink:" + std::to_string(state->load_job_id()), - state->instance_mem_tracker(), true, false); + MemTracker::create_tracker(-1, "OlapTableSink:" + std::to_string(state->load_job_id()), + state->instance_mem_tracker()); SCOPED_TIMER(_profile->total_time_counter()); @@ -839,8 +840,8 @@ Status OlapTableSink::open(RuntimeState* state) { _send_batch_thread_pool_token = state->exec_env()->send_batch_thread_pool()->new_token( ThreadPool::ExecutionMode::CONCURRENT, send_batch_parallelism); RETURN_IF_ERROR(Thread::create( - "OlapTableSink", "send_batch_process", [this]() { this->_send_batch_process(); }, - &_sender_thread)); + "OlapTableSink", "send_batch_process", + [this, state]() { this->_send_batch_process(state); }, &_sender_thread)); return Status::OK(); } @@ 
-1209,8 +1210,10 @@ Status OlapTableSink::_validate_data(RuntimeState* state, RowBatch* batch, Bitma return Status::OK(); } -void OlapTableSink::_send_batch_process() { +void OlapTableSink::_send_batch_process(RuntimeState* state) { SCOPED_TIMER(_non_blocking_send_timer); + SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(state->query_id()), + state->fragment_instance_id()); do { int running_channels_num = 0; for (auto index_channel : _channels) { diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h index b31f3841c337a7..19e80dd20d6992 100644 --- a/be/src/exec/tablet_sink.h +++ b/be/src/exec/tablet_sink.h @@ -365,7 +365,7 @@ class OlapTableSink : public DataSink { // the consumer func of sending pending batches in every NodeChannel. // use polling & NodeChannel::try_send_and_fetch_status() to achieve nonblocking sending. // only focus on pending batches and channel status, the internal errors of NodeChannels will be handled by the producer - void _send_batch_process(); + void _send_batch_process(RuntimeState* state); protected: friend class NodeChannel; diff --git a/be/src/exprs/agg_fn_evaluator.cpp b/be/src/exprs/agg_fn_evaluator.cpp index d83920c43bb20e..f77e74be37c3eb 100644 --- a/be/src/exprs/agg_fn_evaluator.cpp +++ b/be/src/exprs/agg_fn_evaluator.cpp @@ -264,7 +264,7 @@ Status AggFnEvaluator::open(RuntimeState* state, FunctionContext* agg_fn_ctx) { void AggFnEvaluator::close(RuntimeState* state) { Expr::close(_input_exprs_ctxs, state); if (UNLIKELY(_total_mem_consumption > 0)) { - _mem_tracker->Release(_total_mem_consumption); + _mem_tracker->release(_total_mem_consumption); } } @@ -435,7 +435,7 @@ void AggFnEvaluator::update_mem_limlits(int len) { _accumulated_mem_consumption += len; // per 16M , update mem_tracker one time if (UNLIKELY(_accumulated_mem_consumption > 16777216)) { - _mem_tracker->Consume(_accumulated_mem_consumption); + _mem_tracker->consume(_accumulated_mem_consumption); _total_mem_consumption += 
_accumulated_mem_consumption; _accumulated_mem_consumption = 0; } diff --git a/be/src/exprs/anyval_util.cpp b/be/src/exprs/anyval_util.cpp index fabdb505cccb36..141fd09847ff91 100644 --- a/be/src/exprs/anyval_util.cpp +++ b/be/src/exprs/anyval_util.cpp @@ -40,7 +40,7 @@ Status allocate_any_val(RuntimeState* state, MemPool* pool, const TypeDescriptor const int anyval_alignment = AnyValUtil::any_val_alignment(type); *result = reinterpret_cast(pool->try_allocate_aligned(anyval_size, anyval_alignment)); if (*result == nullptr) { - return pool->mem_tracker()->MemLimitExceeded(state, mem_limit_exceeded_msg, anyval_size); + return pool->mem_tracker()->mem_limit_exceeded(state, mem_limit_exceeded_msg, anyval_size); } memset(static_cast(*result), 0, anyval_size); return Status::OK(); diff --git a/be/src/exprs/bloomfilter_predicate.h b/be/src/exprs/bloomfilter_predicate.h index a6b7f83636c4a6..6fd16a1a9e81d2 100644 --- a/be/src/exprs/bloomfilter_predicate.h +++ b/be/src/exprs/bloomfilter_predicate.h @@ -99,7 +99,7 @@ class BloomFilterFuncBase : public IBloomFilterFuncBase { virtual ~BloomFilterFuncBase() { if (_tracker != nullptr) { - _tracker->Release(_bloom_filter_alloced); + _tracker->release(_bloom_filter_alloced); } } @@ -115,7 +115,7 @@ class BloomFilterFuncBase : public IBloomFilterFuncBase { _bloom_filter_alloced = bloom_filter_length; _bloom_filter.reset(BloomFilterAdaptor::create()); RETURN_IF_ERROR(_bloom_filter->init(bloom_filter_length)); - _tracker->Consume(_bloom_filter_alloced); + _tracker->consume(_bloom_filter_alloced); _inited = true; return Status::OK(); } @@ -138,7 +138,7 @@ class BloomFilterFuncBase : public IBloomFilterFuncBase { } _bloom_filter_alloced = len; - _tracker->Consume(_bloom_filter_alloced); + _tracker->consume(_bloom_filter_alloced); return _bloom_filter->init(data, len); } diff --git a/be/src/exprs/expr_context.cpp b/be/src/exprs/expr_context.cpp index 40e93ee66a14fc..e0f3b6461b030a 100644 --- a/be/src/exprs/expr_context.cpp +++ 
b/be/src/exprs/expr_context.cpp @@ -373,7 +373,7 @@ Status ExprContext::get_const_value(RuntimeState* state, Expr& expr, AnyVal** co // Make sure the memory is owned by this evaluator. char* ptr_copy = reinterpret_cast(_pool->try_allocate(sv->len)); if (ptr_copy == nullptr) { - return _pool->mem_tracker()->MemLimitExceeded( + return _pool->mem_tracker()->mem_limit_exceeded( state, "Could not allocate constant string value", sv->len); } memcpy(ptr_copy, sv->ptr, sv->len); diff --git a/be/src/exprs/new_agg_fn_evaluator.cc b/be/src/exprs/new_agg_fn_evaluator.cc index 7a2209ba7fedab..376643516cccc9 100644 --- a/be/src/exprs/new_agg_fn_evaluator.cc +++ b/be/src/exprs/new_agg_fn_evaluator.cc @@ -101,7 +101,7 @@ NewAggFnEvaluator::NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, NewAggFnEvaluator::~NewAggFnEvaluator() { if (UNLIKELY(_total_mem_consumption > 0)) { - _mem_tracker->Release(_total_mem_consumption); + _mem_tracker->release(_total_mem_consumption); } DCHECK(closed_); } diff --git a/be/src/gutil/strings/numbers.cc b/be/src/gutil/strings/numbers.cc index 5027dea46b89d4..6cc76d24850ffa 100644 --- a/be/src/gutil/strings/numbers.cc +++ b/be/src/gutil/strings/numbers.cc @@ -1479,6 +1479,41 @@ string ItoaKMGT(int64 i) { return StringPrintf("%s%" PRId64 "%s", sign, val, suffix); } +string AccurateItoaKMGT(int64 i) { + const char *sign = ""; + if (i < 0) { + // We lose some accuracy if the caller passes LONG_LONG_MIN, but + // that's OK as this function is only for human readability + if (i == numeric_limits::min()) i++; + sign = "-"; + i = -i; + } + + string ret = StringPrintf("%s", sign); + int64 val; + if ((val = (i >> 40)) > 1) { + ret += StringPrintf(" %" PRId64 "%s", val, "T"); + i = i - (val << 40); + } + if ((val = (i >> 30)) > 1) { + ret += StringPrintf(" %" PRId64 "%s", val, "G"); + i = i - (val << 30); + } + if ((val = (i >> 20)) > 1) { + ret += StringPrintf(" %" PRId64 "%s", val, "M"); + i = i - (val << 20); + } + if ((val = (i >> 10)) > 1) { + ret 
+= StringPrintf(" %" PRId64 "%s", val, "K"); + i = i - (val << 10); + } else { + ret += StringPrintf(" %" PRId64 "%s", i, "K"); + } + + return ret; +} + + // DEPRECATED(wadetregaskis). // These are non-inline because some BUILD files turn on -Wformat-non-literal. diff --git a/be/src/gutil/strings/numbers.h b/be/src/gutil/strings/numbers.h index 00a10d37a81ee5..01540d29008683 100644 --- a/be/src/gutil/strings/numbers.h +++ b/be/src/gutil/strings/numbers.h @@ -474,8 +474,12 @@ char* SimpleItoaWithCommas(__int128_t i, char* buffer, int32_t buffer_size); // e.g. 3000 -> 2K 57185920 -> 45M // // Return value: string +// +// AccurateItoaKMGT() +// Description: preserve accuracy // ---------------------------------------------------------------------- string ItoaKMGT(int64 i); +string AccurateItoaKMGT(int64 i); // ---------------------------------------------------------------------- // ParseDoubleRange() diff --git a/be/src/http/action/compaction_action.h b/be/src/http/action/compaction_action.h index a989c9b8293f6b..8138279aeeb298 100644 --- a/be/src/http/action/compaction_action.h +++ b/be/src/http/action/compaction_action.h @@ -39,10 +39,11 @@ const std::string PARAM_COMPACTION_CUMULATIVE = "cumulative"; /// See compaction-action.md for details. class CompactionAction : public HttpHandler { public: - CompactionAction(CompactionActionType type) - : _type(type) { - _compaction_mem_tracker = type == RUN_COMPACTION ? - MemTracker::CreateTracker(-1, "ManualCompaction", nullptr, false, false, MemTrackerLevel::TASK) : nullptr; + CompactionAction(CompactionActionType type) : _type(type) { + _compaction_mem_tracker = + type == RUN_COMPACTION ? 
MemTracker::create_tracker(-1, "ManualCompaction", nullptr, + MemTrackerLevel::TASK) + : nullptr; } virtual ~CompactionAction() {} diff --git a/be/src/http/default_path_handlers.cpp b/be/src/http/default_path_handlers.cpp index d8416970dc00fe..3bb77d3057a5f1 100644 --- a/be/src/http/default_path_handlers.cpp +++ b/be/src/http/default_path_handlers.cpp @@ -144,12 +144,22 @@ void mem_tracker_handler(const WebPageHandler::ArgumentMap& args, std::stringstr (*output) << "\n"; std::vector> trackers; - MemTracker::ListTrackers(&trackers); + MemTracker::list_root_trackers(&trackers); for (const shared_ptr& tracker : trackers) { string parent = tracker->parent() == nullptr ? "none" : tracker->parent()->label(); - string limit_str = tracker->limit() == -1 ? "none" : ItoaKMGT(tracker->limit()); - string current_consumption_str = ItoaKMGT(tracker->consumption()); - string peak_consumption_str = ItoaKMGT(tracker->peak_consumption()); + string limit_str; + string current_consumption_str; + string peak_consumption_str; + if (!config::memory_leak_detection) { + limit_str = tracker->limit() == -1 ? "none" : ItoaKMGT(tracker->limit()); + current_consumption_str = ItoaKMGT(tracker->consumption()); + peak_consumption_str = ItoaKMGT(tracker->peak_consumption()); + } else { + limit_str = tracker->limit() == -1 ? 
"none" : AccurateItoaKMGT(tracker->limit()); + current_consumption_str = AccurateItoaKMGT(tracker->consumption()); + peak_consumption_str = AccurateItoaKMGT(tracker->peak_consumption()); + } + int64_t use_count = tracker.use_count(); (*output) << strings::Substitute( "$0$1$2" // id, parent, limit diff --git a/be/src/olap/aggregate_func.h b/be/src/olap/aggregate_func.h index f1996330f1c4eb..e282a6e1caa0a9 100644 --- a/be/src/olap/aggregate_func.h +++ b/be/src/olap/aggregate_func.h @@ -488,7 +488,7 @@ struct AggregateFuncTraitsdata = reinterpret_cast(hll); - mem_pool->mem_tracker()->Consume(hll->memory_consumed()); + mem_pool->mem_tracker()->consume(hll->memory_consumed()); agg_pool->add(hll); } @@ -534,7 +534,7 @@ struct AggregateFuncTraitssize = 0; auto bitmap = new BitmapValue(src_slice->data); - mem_pool->mem_tracker()->Consume(sizeof(BitmapValue)); + mem_pool->mem_tracker()->consume(sizeof(BitmapValue)); dst_slice->data = (char*)bitmap; agg_pool->add(bitmap); diff --git a/be/src/olap/collect_iterator.h b/be/src/olap/collect_iterator.h index e0dd44d2e3601f..2161ccf437806e 100644 --- a/be/src/olap/collect_iterator.h +++ b/be/src/olap/collect_iterator.h @@ -17,6 +17,8 @@ #pragma once +#include + #include "olap/olap_define.h" #include "olap/row_cursor.h" #include "olap/rowset/rowset_reader.h" diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 8576292a807c81..31e2dd646e70ba 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -28,11 +28,14 @@ namespace doris { Compaction::Compaction(TabletSharedPtr tablet, const std::string& label, const std::shared_ptr& parent_tracker) - : _mem_tracker(MemTracker::CreateTracker(-1, label, parent_tracker, true, false, MemTrackerLevel::TASK)), - _readers_tracker(MemTracker::CreateTracker(-1, "CompactionReaderTracker:" + std::to_string(tablet->tablet_id()), _mem_tracker, - true, false)), - _writer_tracker(MemTracker::CreateTracker(-1, "CompationWriterTracker:" + 
std::to_string(tablet->tablet_id()), _mem_tracker, - true, false)), + : _mem_tracker( + MemTracker::create_tracker(-1, label, parent_tracker, MemTrackerLevel::TASK)), + _readers_tracker(MemTracker::create_tracker( + -1, "CompactionReaderTracker:" + std::to_string(tablet->tablet_id()), + _mem_tracker)), + _writer_tracker(MemTracker::create_tracker( + -1, "CompationWriterTracker:" + std::to_string(tablet->tablet_id()), + _mem_tracker)), _tablet(tablet), _input_rowsets_size(0), _input_row_num(0), @@ -141,7 +144,8 @@ OLAPStatus Compaction::do_compaction_impl(int64_t permits) { << ", output_version=" << _output_version << ", current_max_version=" << current_max_version << ", disk=" << _tablet->data_dir()->path() << ", segments=" << segments_num - << ". elapsed time=" << watch.get_elapse_second() << "s. cumulative_compaction_policy=" + << ". elapsed time=" << watch.get_elapse_second() + << "s. cumulative_compaction_policy=" << _tablet->cumulative_compaction_policy()->name() << "."; return OLAP_SUCCESS; @@ -173,9 +177,9 @@ OLAPStatus Compaction::construct_input_rowset_readers() { for (auto& rowset : _input_rowsets) { RowsetReaderSharedPtr rs_reader; RETURN_NOT_OK(rowset->create_reader( - MemTracker::CreateTracker( + MemTracker::create_tracker( -1, "Compaction:RowsetReader:" + rowset->rowset_id().to_string(), - _readers_tracker, true, true), + _readers_tracker), &rs_reader)); _input_rs_readers.push_back(std::move(rs_reader)); } @@ -295,4 +299,4 @@ int64_t Compaction::get_compaction_permits() { return permits; } -} // namespace doris +} // namespace doris diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp index c90d017c917331..696e2c5b5836d8 100644 --- a/be/src/olap/delta_writer.cpp +++ b/be/src/olap/delta_writer.cpp @@ -105,7 +105,7 @@ OLAPStatus DeltaWriter::init() { return OLAP_ERR_TABLE_NOT_FOUND; } - _mem_tracker = MemTracker::CreateTracker(-1, "DeltaWriter:" + std::to_string(_tablet->tablet_id()), + _mem_tracker = MemTracker::create_tracker(-1, 
"DeltaWriter:" + std::to_string(_tablet->tablet_id()), _parent_mem_tracker); // check tablet version number if (_tablet->version_count() > config::max_tablet_version_num) { diff --git a/be/src/olap/fs/file_block_manager.cpp b/be/src/olap/fs/file_block_manager.cpp index 8e54df99d6de07..06721b7b631498 100644 --- a/be/src/olap/fs/file_block_manager.cpp +++ b/be/src/olap/fs/file_block_manager.cpp @@ -368,8 +368,8 @@ Status FileReadableBlock::readv(uint64_t offset, const Slice* results, size_t re FileBlockManager::FileBlockManager(Env* env, BlockManagerOptions opts) : _env(DCHECK_NOTNULL(env)), _opts(std::move(opts)), - _mem_tracker(MemTracker::CreateTracker(-1, "FileBlockManager", _opts.parent_mem_tracker, - false, false, MemTrackerLevel::OVERVIEW)) { + _mem_tracker(MemTracker::create_tracker(-1, "FileBlockManager", _opts.parent_mem_tracker, + MemTrackerLevel::OVERVIEW)) { if (_opts.enable_metric) { _metrics.reset(new internal::BlockManagerMetrics()); } diff --git a/be/src/olap/generic_iterators.cpp b/be/src/olap/generic_iterators.cpp index 1b8f176637ac96..0d31955aad3844 100644 --- a/be/src/olap/generic_iterators.cpp +++ b/be/src/olap/generic_iterators.cpp @@ -210,7 +210,7 @@ class MergeIterator : public RowwiseIterator { MergeIterator(std::vector iters, std::shared_ptr parent, int sequence_id_idx) : _origin_iters(std::move(iters)), _sequence_id_idx(sequence_id_idx), _merge_heap(MergeContextComparator(_sequence_id_idx)) { // use for count the mem use of Block use in Merge - _mem_tracker = MemTracker::CreateTracker(-1, "MergeIterator", std::move(parent), false); + _mem_tracker = MemTracker::create_tracker(-1, "MergeIterator", std::move(parent)); } ~MergeIterator() override { @@ -325,7 +325,7 @@ class UnionIterator : public RowwiseIterator { // Client should not use iterators any more. 
UnionIterator(std::vector &v, std::shared_ptr parent) : _origin_iters(v.begin(), v.end()) { - _mem_tracker = MemTracker::CreateTracker(-1, "UnionIterator", parent, false); + _mem_tracker = MemTracker::create_tracker(-1, "UnionIterator", parent); } ~UnionIterator() override { diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index ca73ab1d8ba1fc..b93e1190f10b3a 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -14,6 +14,7 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/olap_index.h" +#include "runtime/thread_context.h" #include "olap/row_block.h" #include "olap/utils.h" #include "util/doris_metrics.h" @@ -292,7 +293,8 @@ void LRUCache::_evict_one_entry(LRUHandle* e) { Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value), - CachePriority priority) { + CachePriority priority, std::shared_ptr source_mem_tracker) { + size_t handle_size = sizeof(LRUHandle) - 1 + key.size(); LRUHandle* e = reinterpret_cast(malloc(handle_size)); e->value = value; @@ -318,6 +320,8 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // note that the cache might get larger than its capacity if not enough // space was freed auto old = _table.insert(e); + // DCHECK(thread_local_ctx.thread_mem_tracker()->GetQueryMemTracker() == nullptr); + thread_local_ctx.transfer_in_thread_tracker(source_mem_tracker, charge); _usage += e->total_size; if (old != nullptr) { old->in_cache = false; @@ -335,6 +339,7 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // we free the entries here outside of mutex for // performance reasons while (to_remove_head != nullptr) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); LRUHandle* next = to_remove_head->next; to_remove_head->free(); to_remove_head = next; @@ -442,8 +447,7 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t 
total_capacity, std::shared_ptr parent) : _name(name), _last_id(1), - _mem_tracker(MemTracker::CreateTracker(-1, name, parent, true, false, - MemTrackerLevel::OVERVIEW)) { + _mem_tracker(MemTracker::create_tracker(-1, name, parent, MemTrackerLevel::OVERVIEW)) { const size_t per_shard = (total_capacity + (kNumShards - 1)) / kNumShards; for (int s = 0; s < kNumShards; s++) { _shards[s] = new LRUCache(type); @@ -467,27 +471,32 @@ ShardedLRUCache::~ShardedLRUCache() { } _entity->deregister_hook(_name); DorisMetrics::instance()->metric_registry()->deregister_entity(_entity); - _mem_tracker->Release(_mem_tracker->consumption()); + // _mem_tracker->release(_mem_tracker->consumption()); } Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value), CachePriority priority) { + std::shared_ptr source_mem_tracker = thread_local_ctx.thread_mem_tracker(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); const uint32_t hash = _hash_slice(key); - return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority); + return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority, source_mem_tracker); } Cache::Handle* ShardedLRUCache::lookup(const CacheKey& key) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); const uint32_t hash = _hash_slice(key); return _shards[_shard(hash)]->lookup(key, hash); } void ShardedLRUCache::release(Handle* handle) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); LRUHandle* h = reinterpret_cast(handle); _shards[_shard(h->hash)]->release(handle); } void ShardedLRUCache::erase(const CacheKey& key) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); const uint32_t hash = _hash_slice(key); _shards[_shard(hash)]->erase(key, hash); } @@ -506,6 +515,7 @@ uint64_t ShardedLRUCache::new_id() { } int64_t ShardedLRUCache::prune() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); int64_t num_prune = 0; for (int s = 0; 
s < kNumShards; s++) { num_prune += _shards[s]->prune(); @@ -514,6 +524,7 @@ int64_t ShardedLRUCache::prune() { } int64_t ShardedLRUCache::prune_if(CacheValuePredicate pred) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); int64_t num_prune = 0; for (int s = 0; s < kNumShards; s++) { num_prune += _shards[s]->prune_if(pred); @@ -541,7 +552,7 @@ void ShardedLRUCache::update_cache_metrics() const { hit_ratio->set_value(total_lookup_count == 0 ? 0 : ((double)total_hit_count / total_lookup_count)); - _mem_tracker->Consume(total_usage - _mem_tracker->consumption()); + // _mem_tracker->consume(total_usage - _mem_tracker->consumption()); } Cache* new_lru_cache(const std::string& name, size_t capacity, diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index 2ea6bda38e2661..8f35c0f4d51c63 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -315,7 +315,7 @@ class LRUCache { // Like Cache methods, but with an extra "hash" parameter. Cache::Handle* insert(const CacheKey& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value), - CachePriority priority = CachePriority::NORMAL); + CachePriority priority = CachePriority::NORMAL, std::shared_ptr source_mem_tracker = nullptr); Cache::Handle* lookup(const CacheKey& key, uint32_t hash); void release(Cache::Handle* handle); void erase(const CacheKey& key, uint32_t hash); diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 2ec1ccbbc1edb5..c0f671166f6a29 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -39,7 +39,7 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet _tuple_desc(tuple_desc), _slot_descs(slot_descs), _keys_type(keys_type), - _mem_tracker(MemTracker::CreateTracker(-1, "MemTable", parent_tracker)), + _mem_tracker(MemTracker::create_tracker(-1, "MemTable", parent_tracker)), _buffer_mem_pool(new MemPool(_mem_tracker.get())), _table_mem_pool(new MemPool(_mem_tracker.get())), 
_schema_size(_schema->schema_size()), diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index fdb574cb1e80f3..7b0ee309d3d3a5 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -46,6 +46,7 @@ class MemTable { int64_t tablet_id() const { return _tablet_id; } size_t memory_usage() const { return _mem_tracker->consumption(); } + std::shared_ptr mem_tracker() { return _mem_tracker; } void insert(const Tuple* tuple); /// Flush OLAPStatus flush(); diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index b63074d2822708..67392d913463ba 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ b/be/src/olap/memtable_flush_executor.cpp @@ -20,6 +20,7 @@ #include #include "olap/memtable.h" +#include "runtime/thread_context.h" #include "util/scoped_cleanup.h" #include "util/time.h" @@ -28,8 +29,7 @@ namespace doris { std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat) { os << "(flush time(ms)=" << stat.flush_time_ns / NANOS_PER_MILLIS << ", flush wait time(ms)=" << stat.flush_wait_time_ns / NANOS_PER_MILLIS - << ", flush count=" << stat.flush_count - << ", flush bytes: " << stat.flush_size_bytes + << ", flush count=" << stat.flush_count << ", flush bytes: " << stat.flush_size_bytes << ", flush disk bytes: " << stat.flush_disk_size_bytes << ")"; return os; } @@ -42,7 +42,8 @@ std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat) { OLAPStatus FlushToken::submit(const std::shared_ptr& memtable) { RETURN_NOT_OK(_flush_status.load()); int64_t submit_task_time = MonotonicNanos(); - _flush_token->submit_func(std::bind(&FlushToken::_flush_memtable, this, memtable, submit_task_time)); + _flush_token->submit_func( + std::bind(&FlushToken::_flush_memtable, this, memtable, submit_task_time)); return OLAP_SUCCESS; } @@ -56,6 +57,7 @@ OLAPStatus FlushToken::wait() { } void FlushToken::_flush_memtable(std::shared_ptr memtable, int64_t submit_task_time) { + 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(memtable->mem_tracker()); _stats.flush_wait_time_ns += (MonotonicNanos() - submit_task_time); SCOPED_CLEANUP({ memtable.reset(); }); // If previous flush has failed, return directly @@ -71,9 +73,8 @@ void FlushToken::_flush_memtable(std::shared_ptr memtable, int64_t sub } VLOG_CRITICAL << "flush memtable cost: " << timer.elapsed_time() - << ", count: " << _stats.flush_count - << ", mem size: " << memtable->memory_usage() - << ", disk size: " << memtable->flush_size(); + << ", count: " << _stats.flush_count << ", mem size: " << memtable->memory_usage() + << ", disk size: " << memtable->flush_size(); _stats.flush_time_ns += timer.elapsed_time(); _stats.flush_count++; _stats.flush_size_bytes += memtable->memory_usage(); diff --git a/be/src/olap/page_cache.cpp b/be/src/olap/page_cache.cpp index 76dd0542a85f4e..b0555c9889a71f 100644 --- a/be/src/olap/page_cache.cpp +++ b/be/src/olap/page_cache.cpp @@ -29,20 +29,26 @@ void StoragePageCache::create_global_cache(size_t capacity, int32_t index_cache_ StoragePageCache::StoragePageCache(size_t capacity, int32_t index_cache_percentage) : _index_cache_percentage(index_cache_percentage), - _mem_tracker(MemTracker::CreateTracker(capacity, "StoragePageCache", nullptr, true, true, MemTrackerLevel::OVERVIEW)) { + _mem_tracker(MemTracker::create_tracker(capacity, "StoragePageCache", nullptr, + MemTrackerLevel::OVERVIEW)) { if (index_cache_percentage == 0) { - _data_page_cache = std::unique_ptr(new_lru_cache("DataPageCache", capacity, _mem_tracker)); + _data_page_cache = + std::unique_ptr(new_lru_cache("DataPageCache", capacity, _mem_tracker)); } else if (index_cache_percentage == 100) { - _index_page_cache = std::unique_ptr(new_lru_cache("IndexPageCache", capacity, _mem_tracker)); + _index_page_cache = + std::unique_ptr(new_lru_cache("IndexPageCache", capacity, _mem_tracker)); } else if (index_cache_percentage > 0 && index_cache_percentage < 100) { - _data_page_cache = 
std::unique_ptr(new_lru_cache("DataPageCache", capacity * (100 - index_cache_percentage) / 100, _mem_tracker)); - _index_page_cache = std::unique_ptr(new_lru_cache("IndexPageCache", capacity * index_cache_percentage / 100, _mem_tracker)); + _data_page_cache = std::unique_ptr(new_lru_cache( + "DataPageCache", capacity * (100 - index_cache_percentage) / 100, _mem_tracker)); + _index_page_cache = std::unique_ptr(new_lru_cache( + "IndexPageCache", capacity * index_cache_percentage / 100, _mem_tracker)); } else { CHECK(false) << "invalid index page cache percentage"; } } -bool StoragePageCache::lookup(const CacheKey& key, PageCacheHandle* handle, segment_v2::PageTypePB page_type) { +bool StoragePageCache::lookup(const CacheKey& key, PageCacheHandle* handle, + segment_v2::PageTypePB page_type) { auto cache = _get_page_cache(page_type); auto lru_handle = cache->lookup(key.encode()); if (lru_handle == nullptr) { diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index ef50f5a3bac316..b1a03a19ede162 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -903,7 +903,7 @@ OLAPStatus PushBrokerReader::init(const Schema* schema, const TBrokerScanRange& } _runtime_profile = _runtime_state->runtime_profile(); _runtime_profile->set_name("PushBrokerReader"); - _mem_tracker = MemTracker::CreateTracker(-1, "PushBrokerReader", + _mem_tracker = MemTracker::create_tracker(-1, "PushBrokerReader", _runtime_state->instance_mem_tracker()); _mem_pool.reset(new MemPool(_mem_tracker.get())); _counter.reset(new ScannerCounter()); diff --git a/be/src/olap/row_block.cpp b/be/src/olap/row_block.cpp index d6f522093a60cd..1b041c80d00c61 100644 --- a/be/src/olap/row_block.cpp +++ b/be/src/olap/row_block.cpp @@ -39,7 +39,7 @@ namespace doris { RowBlock::RowBlock(const TabletSchema* schema, const std::shared_ptr& parent_tracker) : _capacity(0), _schema(schema) { - _tracker = MemTracker::CreateTracker(-1, "RowBlock", parent_tracker, true, true, 
MemTrackerLevel::VERBOSE); + _tracker = MemTracker::create_tracker(-1, "RowBlock", parent_tracker, MemTrackerLevel::VERBOSE); _mem_pool.reset(new MemPool(_tracker.get())); } diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp index dda02b335d71eb..21132d58bf2c5d 100644 --- a/be/src/olap/row_block2.cpp +++ b/be/src/olap/row_block2.cpp @@ -39,7 +39,7 @@ RowBlockV2::RowBlockV2(const Schema& schema, uint16_t capacity, std::shared_ptr< : _schema(schema), _capacity(capacity), _column_vector_batches(_schema.num_columns()), - _tracker(MemTracker::CreateTracker(-1, "RowBlockV2", std::move(parent))), + _tracker(MemTracker::create_tracker(-1, "RowBlockV2", std::move(parent))), _pool(new MemPool(_tracker.get())), _selection_vector(nullptr) { for (auto cid : _schema.column_ids()) { diff --git a/be/src/olap/rowset/segment_reader.cpp b/be/src/olap/rowset/segment_reader.cpp index 192caaa8789b44..a1d1f9c500202e 100644 --- a/be/src/olap/rowset/segment_reader.cpp +++ b/be/src/olap/rowset/segment_reader.cpp @@ -58,7 +58,7 @@ SegmentReader::SegmentReader(const std::string file, SegmentGroup* segment_group _is_using_mmap(false), _is_data_loaded(false), _buffer_size(0), - _tracker(MemTracker::CreateTracker(-1, "SegmentReader:" + file, parent_tracker, false)), + _tracker(MemTracker::create_tracker(-1, "SegmentReader:" + file, parent_tracker)), _mem_pool(new MemPool(_tracker.get())), _shared_buffer(nullptr), _lru_cache(lru_cache), @@ -87,7 +87,7 @@ SegmentReader::~SegmentReader() { _file_handler.close(); if (_is_data_loaded && _runtime_state != nullptr) { - MemTracker::update_limits(_buffer_size * -1, _runtime_state->mem_trackers()); + MemTracker::batch_consume(_buffer_size * -1, _runtime_state->mem_trackers()); } for (auto& it : _streams) { @@ -250,8 +250,7 @@ OLAPStatus SegmentReader::seek_to_block(uint32_t first_block, uint32_t last_bloc } if (_runtime_state != nullptr) { - MemTracker::update_limits(_buffer_size, _runtime_state->mem_trackers()); - if 
(MemTracker::limit_exceeded(_runtime_state->mem_trackers())) { + if (!MemTracker::batch_consume(_buffer_size, _runtime_state->mem_trackers())) { return OLAP_ERR_FETCH_MEMORY_EXCEEDED; } } @@ -837,7 +836,7 @@ OLAPStatus SegmentReader::_reset_readers() { for (std::map::iterator it = _streams.begin(); it != _streams.end(); ++it) { if (_runtime_state != nullptr) { - MemTracker::update_limits(-1 * it->second->get_buffer_size(), + MemTracker::batch_consume(-1 * it->second->get_buffer_size(), _runtime_state->mem_trackers()); } delete it->second; @@ -851,7 +850,7 @@ OLAPStatus SegmentReader::_reset_readers() { continue; } if (_runtime_state != nullptr) { - MemTracker::update_limits(-1 * (*it)->get_buffer_size(), + MemTracker::batch_consume(-1 * (*it)->get_buffer_size(), _runtime_state->mem_trackers()); } delete (*it); diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 9b14ff4201d4f7..30848d497eeb15 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -458,7 +458,7 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, bool FileColumnIterator::FileColumnIterator(ColumnReader* reader) : _reader(reader) {} FileColumnIterator::~FileColumnIterator() { - _opts.mem_tracker->Release(_opts.mem_tracker->consumption()); + _opts.mem_tracker->release(_opts.mem_tracker->consumption()); } Status FileColumnIterator::seek_to_first() { diff --git a/be/src/olap/rowset/segment_v2/page_io.cpp b/be/src/olap/rowset/segment_v2/page_io.cpp index 739cde1597d0cb..0fd13dd164a8aa 100644 --- a/be/src/olap/rowset/segment_v2/page_io.cpp +++ b/be/src/olap/rowset/segment_v2/page_io.cpp @@ -26,6 +26,7 @@ #include "olap/fs/block_manager.h" #include "olap/page_cache.h" #include "util/block_compression.h" +#include "runtime/thread_context.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/faststring.h" @@ -139,7 +140,11 @@ Status 
PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* } // hold compressed page at first, reset to decompressed page later - std::unique_ptr page(new char[page_size]); + std::unique_ptr page; + { + // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + page.reset(new char[page_size]); + } Slice page_slice(page.get(), page_size); { SCOPED_RAW_TIMER(&opts.stats->io_ns); @@ -170,8 +175,11 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* return Status::Corruption("Bad page: page is compressed but codec is NO_COMPRESSION"); } SCOPED_RAW_TIMER(&opts.stats->decompress_ns); - std::unique_ptr decompressed_page( - new char[footer->uncompressed_size() + footer_size + 4]); + std::unique_ptr decompressed_page; + { + // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + decompressed_page.reset(new char[footer->uncompressed_size() + footer_size + 4]); + } // decompress page body Slice compressed_body(page_slice.data, body_size); @@ -185,8 +193,11 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* // append footer and footer size memcpy(decompressed_body.data + decompressed_body.size, page_slice.data + body_size, footer_size + 4); - // free memory of compressed page - page = std::move(decompressed_page); + { + // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + // free memory of compressed page + page = std::move(decompressed_page); + } page_slice = Slice(page.get(), footer->uncompressed_size() + footer_size + 4); opts.stats->uncompressed_bytes_read += page_slice.size; } else { diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 470efc0a9ec765..48f94b0322cfed 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -51,14 +51,14 @@ Segment::Segment(const FilePathDesc& path_desc, uint32_t segment_id, const Table : _path_desc(path_desc), _segment_id(segment_id), _tablet_schema(tablet_schema) { #ifndef BE_TEST - 
_mem_tracker = MemTracker::CreateTracker(-1, "Segment", StorageEngine::instance()->tablet_mem_tracker(), false); + _mem_tracker = MemTracker::create_tracker(-1, "Segment", StorageEngine::instance()->tablet_mem_tracker()); #else - _mem_tracker = MemTracker::CreateTracker(-1, "Segment", nullptr, false); + _mem_tracker = MemTracker::create_tracker(-1, "Segment", nullptr); #endif } Segment::~Segment() { - _mem_tracker->Release(_mem_tracker->consumption()); + _mem_tracker->release(_mem_tracker->consumption()); } Status Segment::_open() { @@ -128,7 +128,7 @@ Status Segment::_parse_footer() { return Status::Corruption(strings::Substitute("Bad segment file $0: file size $1 < $2", _path_desc.filepath, file_size, 12 + footer_length)); } - _mem_tracker->Consume(footer_length); + _mem_tracker->consume(footer_length); std::string footer_buf; footer_buf.resize(footer_length); @@ -172,7 +172,7 @@ Status Segment::_load_index() { DCHECK_EQ(footer.type(), SHORT_KEY_PAGE); DCHECK(footer.has_short_key_page_footer()); - _mem_tracker->Consume(body.get_size()); + _mem_tracker->consume(body.get_size()); _sk_index_decoder.reset(new ShortKeyIndexDecoder); return _sk_index_decoder->parse(body, footer.short_key_page_footer()); }); @@ -214,7 +214,7 @@ Status Segment::new_column_iterator(uint32_t cid, std::shared_ptr pa tablet_column.has_default_value(), tablet_column.default_value(), tablet_column.is_nullable(), type_info, tablet_column.length())); ColumnIteratorOptions iter_opts; - iter_opts.mem_tracker = MemTracker::CreateTracker(-1, "DefaultColumnIterator", parent, false); + iter_opts.mem_tracker = MemTracker::create_tracker(-1, "DefaultColumnIterator", parent); RETURN_IF_ERROR(default_value_iter->init(iter_opts)); *iter = default_value_iter.release(); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index e5c7e0883f9d3b..9a72ef26a59330 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -101,7 +101,7 @@ SegmentIterator::SegmentIterator(std::shared_ptr segment, const Schema& _lazy_materialization_read(false), _inited(false) { // use for count the mem use of ColumnIterator - _mem_tracker = MemTracker::CreateTracker(-1, "SegmentIterator", std::move(parent), false); + _mem_tracker = MemTracker::create_tracker(-1, "SegmentIterator", std::move(parent)); } SegmentIterator::~SegmentIterator() { @@ -209,7 +209,7 @@ Status SegmentIterator::_prepare_seek(const StorageReadOptions::KeyRange& key_ra iter_opts.stats = _opts.stats; iter_opts.rblock = _rblock.get(); iter_opts.mem_tracker = - MemTracker::CreateTracker(-1, "ColumnIterator", _mem_tracker, false); + MemTracker::create_tracker(-1, "ColumnIterator", _mem_tracker); RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts)); } } @@ -341,7 +341,7 @@ Status SegmentIterator::_init_return_column_iterators() { iter_opts.use_page_cache = _opts.use_page_cache; iter_opts.rblock = _rblock.get(); iter_opts.mem_tracker = - MemTracker::CreateTracker(-1, "ColumnIterator", _mem_tracker, false); + MemTracker::create_tracker(-1, "ColumnIterator", _mem_tracker); RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts)); } } diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index adbfef96940a14..e7d4db99c514cd 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -37,18 +37,23 @@ const char* k_segment_magic = "D0R1"; const uint32_t k_segment_magic_length = 4; SegmentWriter::SegmentWriter(fs::WritableBlock* wblock, uint32_t segment_id, - const TabletSchema* tablet_schema, const SegmentWriterOptions& opts, std::shared_ptr parent) - : _segment_id(segment_id), _tablet_schema(tablet_schema), _opts(opts), _wblock(wblock), _mem_tracker(MemTracker::CreateTracker( - -1, "Segment-" + std::to_string(segment_id), parent, false)) { + const TabletSchema* 
tablet_schema, const SegmentWriterOptions& opts, + std::shared_ptr parent) + : _segment_id(segment_id), + _tablet_schema(tablet_schema), + _opts(opts), + _wblock(wblock), + _mem_tracker( + MemTracker::create_tracker(-1, "Segment-" + std::to_string(segment_id), parent)) { CHECK_NOTNULL(_wblock); } SegmentWriter::~SegmentWriter() { - _mem_tracker->Release(_mem_tracker->consumption()); + _mem_tracker->release(_mem_tracker->consumption()); }; void SegmentWriter::init_column_meta(ColumnMetaPB* meta, uint32_t* column_id, - const TabletColumn& column) { + const TabletColumn& column) { // TODO(zc): Do we need this column_id?? meta->set_column_id((*column_id)++); meta->set_unique_id(column.unique_id()); @@ -129,7 +134,7 @@ uint64_t SegmentWriter::estimate_segment_size() { size += _index_builder->size(); // update the mem_tracker of segment size - _mem_tracker->Consume(size - _mem_tracker->consumption()); + _mem_tracker->consume(size - _mem_tracker->consumption()); return size; } @@ -218,7 +223,7 @@ Status SegmentWriter::_write_footer() { // that will need an extra seek when reading fixed_buf.append(k_segment_magic, k_segment_magic_length); - std::vector slices{footer_buf, fixed_buf}; + std::vector slices {footer_buf, fixed_buf}; return _write_raw_data(slices); } diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index e3c7d8b4bf8feb..b73d6ef30a1bba 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -745,7 +745,7 @@ bool RowBlockSorter::sort(RowBlock** row_block) { RowBlockAllocator::RowBlockAllocator(const TabletSchema& tablet_schema, std::shared_ptr parent, size_t memory_limitation) : _tablet_schema(tablet_schema), - _mem_tracker(MemTracker::CreateTracker(-1, "RowBlockAllocator", parent, false)), + _mem_tracker(MemTracker::create_tracker(-1, "RowBlockAllocator", parent)), _row_len(tablet_schema.row_size()), _memory_limitation(memory_limitation) { VLOG_NOTICE << "RowBlockAllocator(). 
row_len=" << _row_len; @@ -784,7 +784,7 @@ OLAPStatus RowBlockAllocator::allocate(RowBlock** row_block, size_t num_rows, bo row_block_info.null_supported = null_supported; (*row_block)->init(row_block_info); - _mem_tracker->Consume(row_block_size); + _mem_tracker->consume(row_block_size); VLOG_NOTICE << "RowBlockAllocator::allocate() this=" << this << ", num_rows=" << num_rows << ", m_memory_allocated=" << _mem_tracker->consumption() << ", row_block_addr=" << *row_block; @@ -797,7 +797,7 @@ void RowBlockAllocator::release(RowBlock* row_block) { return; } - _mem_tracker->Release(row_block->capacity() * _row_len); + _mem_tracker->release(row_block->capacity() * _row_len); VLOG_NOTICE << "RowBlockAllocator::release() this=" << this << ", num_rows=" << row_block->capacity() @@ -823,8 +823,7 @@ bool RowBlockMerger::merge(const std::vector& row_block_arr, RowsetWr std::shared_ptr parent, uint64_t* merged_rows) { uint64_t tmp_merged_rows = 0; RowCursor row_cursor; - std::shared_ptr tracker( - MemTracker::CreateTracker(-1, "RowBlockMerger", parent, false)); + std::shared_ptr tracker(MemTracker::create_tracker(-1, "RowBlockMerger", parent)); std::unique_ptr mem_pool(new MemPool(tracker.get())); std::unique_ptr agg_object_pool(new ObjectPool()); if (row_cursor.init(_tablet->tablet_schema()) != OLAP_SUCCESS) { @@ -1420,7 +1419,7 @@ bool SchemaChangeWithSorting::_external_sorting(vector& src_row } SchemaChangeHandler::SchemaChangeHandler() - : _mem_tracker(MemTracker::CreateTracker(-1, "SchemaChange", StorageEngine::instance()->schema_change_mem_tracker())) { + : _mem_tracker(MemTracker::create_tracker(-1, "SchemaChange", StorageEngine::instance()->schema_change_mem_tracker())) { REGISTER_HOOK_METRIC(schema_change_mem_consumption, [this]() { return _mem_tracker->consumption(); }); } @@ -1532,8 +1531,8 @@ OLAPStatus SchemaChangeHandler::_do_process_alter_tablet_v2(const TAlterTabletRe reader_context.seek_columns = &return_columns; reader_context.sequence_id_idx = 
reader_context.tablet_schema->sequence_col_idx(); - auto mem_tracker = MemTracker::CreateTracker(-1, "AlterTablet:" + std::to_string(base_tablet->tablet_id()) + "-" - + std::to_string(new_tablet->tablet_id()), _mem_tracker, true, false, MemTrackerLevel::TASK); + auto mem_tracker = MemTracker::create_tracker(-1, "AlterTablet:" + std::to_string(base_tablet->tablet_id()) + "-" + + std::to_string(new_tablet->tablet_id()), _mem_tracker, MemTrackerLevel::TASK); do { // get history data to be converted and it will check if there is hold in base tablet diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp index 198b4b41543c34..0da658c4efbec2 100644 --- a/be/src/olap/segment_loader.cpp +++ b/be/src/olap/segment_loader.cpp @@ -31,8 +31,7 @@ void SegmentLoader::create_global_instance(size_t capacity) { } SegmentLoader::SegmentLoader(size_t capacity) - : _mem_tracker(MemTracker::CreateTracker(capacity, "SegmentLoader", nullptr, true, true, - MemTrackerLevel::OVERVIEW)) { + : _mem_tracker(MemTracker::create_tracker(capacity, "SegmentLoader", nullptr, MemTrackerLevel::OVERVIEW)) { _cache = std::unique_ptr( new_typed_lru_cache("SegmentCache", capacity, LRUCacheType::NUMBER, _mem_tracker)); } diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 201bffb6649bb2..f1139e6ebc92b9 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -112,10 +112,10 @@ StorageEngine::StorageEngine(const EngineOptions& options) _is_all_cluster_id_exist(true), _index_stream_lru_cache(nullptr), _file_cache(nullptr), - _compaction_mem_tracker(MemTracker::CreateTracker(-1, "AutoCompaction", nullptr, false, - false, MemTrackerLevel::OVERVIEW)), - _tablet_mem_tracker(MemTracker::CreateTracker(-1, "TabletHeader", nullptr, false, false, - MemTrackerLevel::OVERVIEW)), + _compaction_mem_tracker(MemTracker::create_tracker(-1, "AutoCompaction", nullptr, + MemTrackerLevel::OVERVIEW)), + 
_tablet_mem_tracker(MemTracker::create_tracker(-1, "TabletHeader", nullptr, + MemTrackerLevel::OVERVIEW)), _stop_background_threads_latch(1), _tablet_manager(new TabletManager(config::tablet_map_shard_size)), _txn_manager(new TxnManager(config::txn_map_shard_size, config::txn_shard_size)), @@ -134,7 +134,7 @@ StorageEngine::StorageEngine(const EngineOptions& options) REGISTER_HOOK_METRIC(compaction_mem_consumption, [this]() { return _compaction_mem_tracker->consumption(); // We can get each compaction's detail usage - // LOG(INFO) << _compaction_mem_tracker=>LogUsage(2); + // LOG(INFO) << _compaction_mem_tracker=>log_usage(2); }); } diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index 2e7cd8a8a45c52..592bac395383a4 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -73,8 +73,7 @@ static bool _cmp_tablet_by_create_time(const TabletSharedPtr& a, const TabletSha } TabletManager::TabletManager(int32_t tablet_map_lock_shard_size) - : _mem_tracker(MemTracker::CreateTracker(-1, "TabletMeta", nullptr, false, false, - MemTrackerLevel::OVERVIEW)), + : _mem_tracker(MemTracker::create_tracker(-1, "TabletMeta", nullptr, MemTrackerLevel::OVERVIEW)), _tablets_shards_size(tablet_map_lock_shard_size), _tablets_shards_mask(tablet_map_lock_shard_size - 1), _last_update_stat_ms(0) { @@ -89,7 +88,7 @@ TabletManager::TabletManager(int32_t tablet_map_lock_shard_size) } TabletManager::~TabletManager() { - _mem_tracker->Release(_mem_tracker->consumption()); + _mem_tracker->release(_mem_tracker->consumption()); DEREGISTER_HOOK_METRIC(tablet_meta_mem_consumption); } @@ -204,7 +203,7 @@ OLAPStatus TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id, Schem // TODO: remove multiply 2 of tablet meta mem size // Because table schema will copy in tablet, there will be double mem cost // so here multiply 2 - _mem_tracker->Consume(tablet->tablet_meta()->mem_size() * 2); + _mem_tracker->consume(tablet->tablet_meta()->mem_size() 
* 2); VLOG_NOTICE << "add tablet to map successfully." << " tablet_id=" << tablet_id << ", schema_hash=" << schema_hash; @@ -1368,7 +1367,7 @@ OLAPStatus TabletManager::_drop_tablet_directly_unlocked(TTabletId tablet_id, } dropped_tablet->deregister_tablet_from_dir(); - _mem_tracker->Release(dropped_tablet->tablet_meta()->mem_size() * 2); + _mem_tracker->release(dropped_tablet->tablet_meta()->mem_size() * 2); return OLAP_SUCCESS; } diff --git a/be/src/olap/task/engine_checksum_task.h b/be/src/olap/task/engine_checksum_task.h index 7f0cdb6e6bdcc8..0430c560e99101 100644 --- a/be/src/olap/task/engine_checksum_task.h +++ b/be/src/olap/task/engine_checksum_task.h @@ -44,6 +44,7 @@ class EngineChecksumTask : public EngineTask { TSchemaHash _schema_hash; TVersion _version; uint32_t* _checksum; + std::shared_ptr _mem_tracker; }; // EngineTask } // namespace doris diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt index 2647920a1453b6..c38c2a116e24c6 100644 --- a/be/src/runtime/CMakeLists.txt +++ b/be/src/runtime/CMakeLists.txt @@ -46,7 +46,7 @@ set(RUNTIME_FILES runtime_state.cpp runtime_filter_mgr.cpp string_value.cpp - thread_mem_tracker.cpp + thread_mem_tracker_mgr.cpp thread_resource_mgr.cpp decimalv2_value.cpp large_int_value.cpp @@ -68,6 +68,7 @@ set(RUNTIME_FILES disk_io_mgr_scan_range.cc buffered_block_mgr2.cc mem_tracker.cpp + mem_tracker_task_pool.cpp spill_sorter.cc sorted_run_merger.cc data_stream_recvr.cc diff --git a/be/src/runtime/buffered_block_mgr2.cc b/be/src/runtime/buffered_block_mgr2.cc index 92edcdcabe5c3e..5b70f763c6c9ab 100644 --- a/be/src/runtime/buffered_block_mgr2.cc +++ b/be/src/runtime/buffered_block_mgr2.cc @@ -100,8 +100,7 @@ class BufferedBlockMgr2::Client { DCHECK(buffer != nullptr); if (buffer->len == _mgr->max_block_size()) { ++_num_pinned_buffers; - _tracker->ConsumeLocal(buffer->len, _query_tracker.get()); - // _tracker->Consume(buffer->len); + _tracker->consume(buffer->len, _query_tracker.get()); } } @@ -110,8 
+109,7 @@ class BufferedBlockMgr2::Client { if (buffer->len == _mgr->max_block_size()) { DCHECK_GT(_num_pinned_buffers, 0); --_num_pinned_buffers; - _tracker->ReleaseLocal(buffer->len, _query_tracker.get()); - // _tracker->Release(buffer->len); + _tracker->release(buffer->len, _query_tracker.get()); } } @@ -261,7 +259,7 @@ int64_t BufferedBlockMgr2::available_buffers(Client* client) const { int64_t BufferedBlockMgr2::remaining_unreserved_buffers() const { int64_t num_buffers = _free_io_buffers.size() + _unpinned_blocks.size() + _non_local_outstanding_writes; - num_buffers += _mem_tracker->SpareCapacity(MemLimit::HARD) / max_block_size(); + num_buffers += _mem_tracker->spare_capacity() / max_block_size(); num_buffers -= _unfullfilled_reserved_buffers; return num_buffers; } @@ -324,24 +322,24 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) { } int buffers_needed = BitUtil::ceil(size, max_block_size()); unique_lock lock(_lock); - Status st = _mem_tracker->TryConsume(size); + Status st = _mem_tracker->try_consume(size); WARN_IF_ERROR(st, "consume failed"); if (size < max_block_size() && st) { // For small allocations (less than a block size), just let the allocation through. - client->_tracker->ConsumeLocal(size, client->_query_tracker.get()); - // client->_tracker->Consume(size); + client->_tracker->consume(size, client->_query_tracker.get()); + // client->_tracker->consume(size); return true; } if (available_buffers(client) + client->_num_tmp_reserved_buffers < buffers_needed) { return false; } - st = _mem_tracker->TryConsume(size); + st = _mem_tracker->try_consume(size); WARN_IF_ERROR(st, "consume failed"); if (st) { // There was still unallocated memory, don't need to recycle allocated blocks. 
- client->_tracker->ConsumeLocal(size, client->_query_tracker.get()); - // client->_tracker->Consume(size); + client->_tracker->consume(size, client->_query_tracker.get()); + // client->_tracker->consume(size); return true; } @@ -386,7 +384,7 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) { } client->_num_tmp_reserved_buffers -= additional_tmp_reservations; _unfullfilled_reserved_buffers -= additional_tmp_reservations; - _mem_tracker->Release(buffers_acquired * max_block_size()); + _mem_tracker->release(buffers_acquired * max_block_size()); return false; } @@ -394,21 +392,21 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) { _unfullfilled_reserved_buffers -= buffers_acquired; DCHECK_GE(buffers_acquired * max_block_size(), size); - _mem_tracker->Release(buffers_acquired * max_block_size()); - st = _mem_tracker->TryConsume(size); + _mem_tracker->release(buffers_acquired * max_block_size()); + st = _mem_tracker->try_consume(size); WARN_IF_ERROR(st, "consume failed"); if (!st) { return false; } - client->_tracker->ConsumeLocal(size, client->_query_tracker.get()); - // client->_tracker->Consume(size); + client->_tracker->consume(size, client->_query_tracker.get()); + // client->_tracker->consume(size); DCHECK(validate()) << endl << debug_internal(); return true; } void BufferedBlockMgr2::release_memory(Client* client, int64_t size) { - _mem_tracker->Release(size); - client->_tracker->ReleaseLocal(size, client->_query_tracker.get()); + _mem_tracker->release(size); + client->_tracker->release(size, client->_query_tracker.get()); } void BufferedBlockMgr2::cancel() { @@ -469,7 +467,7 @@ Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Bloc if (len > 0 && len < _max_block_size) { DCHECK(unpin_block == nullptr); - Status st = client->_tracker->TryConsume(len); + Status st = client->_tracker->try_consume(len); WARN_IF_ERROR(st, "get_new_block failed"); if (st) { // TODO: Have a cache of unused blocks of 
size 'len' (0, _max_block_size) @@ -600,7 +598,7 @@ BufferedBlockMgr2::~BufferedBlockMgr2() { // Free memory resources. for (BufferDescriptor* buffer : _all_io_buffers) { - _mem_tracker->Release(buffer->len); + _mem_tracker->release(buffer->len); delete[] buffer->buffer; } DCHECK_EQ(_mem_tracker->consumption(), 0); @@ -954,7 +952,7 @@ void BufferedBlockMgr2::delete_block(Block* block) { if (block->_buffer_desc->len != _max_block_size) { // Just delete the block for now. delete[] block->_buffer_desc->buffer; - block->_client->_tracker->Release(block->_buffer_desc->len); + block->_client->_tracker->release(block->_buffer_desc->len); delete block->_buffer_desc; block->_buffer_desc = nullptr; } else { @@ -1094,7 +1092,7 @@ Status BufferedBlockMgr2::find_buffer_for_block(Block* block, bool* in_mem) { Status BufferedBlockMgr2::find_buffer(unique_lock& lock, BufferDescriptor** buffer_desc) { *buffer_desc = nullptr; - Status st = _mem_tracker->TryConsume(_max_block_size); + Status st = _mem_tracker->try_consume(_max_block_size); WARN_IF_ERROR(st, "try to allocate a new buffer failed"); // First, try to allocate a new buffer. 
if (_free_io_buffers.size() < _block_write_threshold && st) { @@ -1262,8 +1260,8 @@ string BufferedBlockMgr2::debug_internal() const { << " Num available buffers: " << remaining_unreserved_buffers() << endl << " Total pinned buffers: " << _total_pinned_buffers << endl << " Unfullfilled reserved buffers: " << _unfullfilled_reserved_buffers << endl - << " Remaining memory: " << _mem_tracker->SpareCapacity(MemLimit::HARD) - << " (#blocks=" << (_mem_tracker->SpareCapacity(MemLimit::HARD) / _max_block_size) << ")" + << " Remaining memory: " << _mem_tracker->spare_capacity() + << " (#blocks=" << (_mem_tracker->spare_capacity() / _max_block_size) << ")" << endl << " Block write threshold: " << _block_write_threshold; return ss.str(); @@ -1295,7 +1293,7 @@ void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile, _integrity_check_timer = ADD_TIMER(_profile.get(), "TotalIntegrityCheckTime"); // Create a new mem_tracker and allocate buffers. - _mem_tracker = MemTracker::CreateTracker(mem_limit, "BufferedBlockMgr2", parent_tracker); + _mem_tracker = MemTracker::create_tracker(mem_limit, "BufferedBlockMgr2", parent_tracker); _initialized = true; } diff --git a/be/src/runtime/bufferpool/buffer_allocator.cc b/be/src/runtime/bufferpool/buffer_allocator.cc index a3bbe4c6c2fdc4..fc639b46c21a71 100644 --- a/be/src/runtime/bufferpool/buffer_allocator.cc +++ b/be/src/runtime/bufferpool/buffer_allocator.cc @@ -22,6 +22,7 @@ #include "common/atomic.h" #include "common/config.h" #include "runtime/bufferpool/system_allocator.h" +#include "runtime/thread_context.h" #include "util/bit_util.h" #include "util/cpu_info.h" #include "util/pretty_printer.h" @@ -220,6 +221,7 @@ Status BufferPool::BufferAllocator::Allocate(ClientHandle* client, int64_t len, COUNTER_UPDATE(client->impl_->counters().cumulative_allocations, 1); RETURN_IF_ERROR(AllocateInternal(len, buffer)); + thread_local_ctx.consume_mem(len); DCHECK(buffer->is_open()); buffer->client_ = client; return 
Status::OK(); @@ -245,7 +247,9 @@ Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* const int current_core = CpuInfo::get_current_core(); // Fast path: recycle a buffer of the correct size from this core's arena. FreeBufferArena* current_core_arena = per_core_arenas_[current_core].get(); - if (current_core_arena->PopFreeBuffer(len, buffer)) return Status::OK(); + if (current_core_arena->PopFreeBuffer(len, buffer)) { + return Status::OK(); + } // Fast-ish path: allocate a new buffer if there is room in 'system_bytes_remaining_'. int64_t delta = DecreaseBytesRemaining(len, true, &system_bytes_remaining_); @@ -264,7 +268,9 @@ Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* // Each core should start searching from a different point to avoid hot-spots. int other_core = numa_node_cores[(numa_node_core_idx + i) % numa_node_cores.size()]; FreeBufferArena* other_core_arena = per_core_arenas_[other_core].get(); - if (other_core_arena->PopFreeBuffer(len, buffer)) return Status::OK(); + if (other_core_arena->PopFreeBuffer(len, buffer)) { + return Status::OK(); + } } /* @@ -298,7 +304,11 @@ Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* } // We have headroom to allocate a new buffer at this point. DCHECK_EQ(delta, len); - Status status = system_allocator_->Allocate(len, buffer); + Status status; + { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + status = system_allocator_->Allocate(len, buffer); + } if (!status.ok()) { system_bytes_remaining_.add(len); return status; @@ -375,6 +385,7 @@ void BufferPool::BufferAllocator::Free(BufferHandle&& handle) { handle.client_ = nullptr; // Buffer is no longer associated with a client. 
FreeBufferArena* arena = per_core_arenas_[handle.home_core_].get(); handle.Poison(); + thread_local_ctx.release_mem(handle.len()); arena->AddFreeBuffer(std::move(handle)); } @@ -420,6 +431,7 @@ int BufferPool::BufferAllocator::GetFreeListSize(int core, int64_t len) { int64_t BufferPool::BufferAllocator::FreeToSystem(std::vector&& buffers) { int64_t bytes_freed = 0; + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); for (BufferHandle& buffer : buffers) { bytes_freed += buffer.len(); // Ensure that the memory is unpoisoned when it's next allocated by the system. diff --git a/be/src/runtime/bufferpool/reservation_tracker.cc b/be/src/runtime/bufferpool/reservation_tracker.cc index 4fa41d85e1b751..cb1b5f929f268c 100644 --- a/be/src/runtime/bufferpool/reservation_tracker.cc +++ b/be/src/runtime/bufferpool/reservation_tracker.cc @@ -75,8 +75,8 @@ void ReservationTracker::InitChildTracker(RuntimeProfile* profile, ReservationTr DCHECK_EQ(parent_mem_tracker, mem_tracker_->parent().get()); // Make sure we don't have a lower limit than the ancestor, since we don't enforce // limits at lower links. - DCHECK_EQ(mem_tracker_->GetLowestLimit(MemLimit::HARD), - parent_mem_tracker->GetLowestLimit(MemLimit::HARD)); + DCHECK_EQ(mem_tracker_->get_lowest_limit(), + parent_mem_tracker->get_lowest_limit()); } else { // Make sure we didn't leave a gap in the links. E.g. this tracker's grandparent // shouldn't have a MemTracker. 
@@ -110,7 +110,6 @@ void ReservationTracker::InitCounters(RuntimeProfile* profile, int64_t reservati counters_.reservation_limit = ADD_COUNTER(profile, "ReservationLimit", TUnit::BYTES); COUNTER_SET(counters_.reservation_limit, reservation_limit); } - if (mem_tracker_ != nullptr) mem_tracker_->EnableReservationReporting(counters_); } void ReservationTracker::Close() { @@ -187,14 +186,14 @@ bool ReservationTracker::TryConsumeFromMemTracker(int64_t reservation_increase) if (GetParentMemTracker() == nullptr) { // At the topmost link, which may be a MemTracker with a limit, we need to use // TryConsume() to check the limit. - Status st = mem_tracker_->TryConsume(reservation_increase); + Status st = mem_tracker_->try_consume(reservation_increase); WARN_IF_ERROR(st, "TryConsumeFromMemTracker failed"); return st.ok(); } else { // For lower links, there shouldn't be a limit to enforce, so we just need to // update the consumption of the linked MemTracker since the reservation is // already reflected in its parent. 
- mem_tracker_->ConsumeLocal(reservation_increase, GetParentMemTracker()); + mem_tracker_->consume(reservation_increase, GetParentMemTracker()); return true; } } @@ -203,9 +202,9 @@ void ReservationTracker::ReleaseToMemTracker(int64_t reservation_decrease) { DCHECK_GE(reservation_decrease, 0); if (mem_tracker_ == nullptr) return; if (GetParentMemTracker() == nullptr) { - mem_tracker_->Release(reservation_decrease); + mem_tracker_->release(reservation_decrease); } else { - mem_tracker_->ReleaseLocal(reservation_decrease, GetParentMemTracker()); + mem_tracker_->release(reservation_decrease, GetParentMemTracker()); } } diff --git a/be/src/runtime/data_stream_recvr.cc b/be/src/runtime/data_stream_recvr.cc index 962395ad2616ed..9bf9bc26f03d33 100644 --- a/be/src/runtime/data_stream_recvr.cc +++ b/be/src/runtime/data_stream_recvr.cc @@ -446,7 +446,8 @@ DataStreamRecvr::DataStreamRecvr( _num_buffered_bytes(0), _profile(profile), _sub_plan_query_statistics_recvr(sub_plan_query_statistics_recvr) { - _mem_tracker = MemTracker::CreateTracker(_profile, -1, "DataStreamRecvr", parent_tracker); + _mem_tracker = MemTracker::create_tracker(-1, "DataStreamRecvr", parent_tracker, + MemTrackerLevel::VERBOSE, _profile); // Create one queue per sender if is_merging is true. int num_queues = is_merging ? 
num_senders : 1; diff --git a/be/src/runtime/data_stream_sender.cpp b/be/src/runtime/data_stream_sender.cpp index 681f5fc20b0db1..0fdb68f6d55f3b 100644 --- a/be/src/runtime/data_stream_sender.cpp +++ b/be/src/runtime/data_stream_sender.cpp @@ -388,9 +388,9 @@ Status DataStreamSender::prepare(RuntimeState* state) { << "])"; _profile = _pool->add(new RuntimeProfile(title.str())); SCOPED_TIMER(_profile->total_time_counter()); - _mem_tracker = MemTracker::CreateTracker( - _profile, -1, "DataStreamSender:" + print_id(state->fragment_instance_id()), - state->instance_mem_tracker()); + _mem_tracker = MemTracker::create_tracker( + -1, "DataStreamSender:" + print_id(state->fragment_instance_id()), + state->instance_mem_tracker(), MemTrackerLevel::VERBOSE, _profile); if (_part_type == TPartitionType::UNPARTITIONED || _part_type == TPartitionType::RANDOM) { std::random_device rd; diff --git a/be/src/runtime/disk_io_mgr.cc b/be/src/runtime/disk_io_mgr.cc index 9b9b350fee8e29..0adf10be22a57c 100644 --- a/be/src/runtime/disk_io_mgr.cc +++ b/be/src/runtime/disk_io_mgr.cc @@ -229,13 +229,13 @@ void DiskIoMgr::BufferDescriptor::set_mem_tracker(std::shared_ptr tr if (_mem_tracker.get() == tracker.get()) { return; } - // TODO(yingchun): use TransferTo? + // TODO(yingchun): use transfer_to? 
if (_mem_tracker != nullptr) { - _mem_tracker->Release(_buffer_len); + _mem_tracker->release(_buffer_len); } _mem_tracker = std::move(tracker); if (_mem_tracker != nullptr) { - _mem_tracker->Consume(_buffer_len); + _mem_tracker->consume(_buffer_len); } } @@ -359,13 +359,12 @@ DiskIoMgr::~DiskIoMgr() { */ } -Status DiskIoMgr::init(const std::shared_ptr& process_mem_tracker) { - DCHECK(process_mem_tracker != nullptr); - _process_mem_tracker = process_mem_tracker; +Status DiskIoMgr::init(const int64_t mem_limit) { + _disk_io_mem_tracker = MemTracker::create_tracker(mem_limit, "DiskIO", nullptr, MemTrackerLevel::OVERVIEW); // If we hit the process limit, see if we can reclaim some memory by removing // previously allocated (but unused) io buffers. /* - * process_mem_tracker->AddGcFunction(bind(&DiskIoMgr::gc_io_buffers, this)); + * process_mem_tracker->add_gc_function(bind(&DiskIoMgr::gc_io_buffers, this)); */ for (int i = 0; i < _disk_queues.size(); ++i) { @@ -713,9 +712,9 @@ char* DiskIoMgr::get_free_buffer(int64_t* buffer_size) { char* buffer = nullptr; if (_free_buffers[idx].empty()) { ++_num_allocated_buffers; - // Update the process mem usage. This is checked the next time we start + // Update the disk io mem usage. 
This is checked the next time we start // a read for the next reader (DiskIoMgr::GetNextScanRange) - _process_mem_tracker->Consume(*buffer_size); + _disk_io_mem_tracker->consume(*buffer_size); buffer = new char[*buffer_size]; } else { buffer = _free_buffers[idx].front(); @@ -733,7 +732,7 @@ void DiskIoMgr::gc_io_buffers() { for (list::iterator iter = _free_buffers[idx].begin(); iter != _free_buffers[idx].end(); ++iter) { int64_t buffer_size = (1 << idx) * _min_buffer_size; - _process_mem_tracker->Release(buffer_size); + _disk_io_mem_tracker->release(buffer_size); --_num_allocated_buffers; delete[] * iter; @@ -760,7 +759,7 @@ void DiskIoMgr::return_free_buffer(char* buffer, int64_t buffer_size) { if (!config::disable_mem_pools && _free_buffers[idx].size() < config::max_free_io_buffers) { _free_buffers[idx].push_back(buffer); } else { - _process_mem_tracker->Release(buffer_size); + _disk_io_mem_tracker->release(buffer_size); --_num_allocated_buffers; delete[] buffer; } @@ -817,15 +816,15 @@ bool DiskIoMgr::get_next_request_range(DiskQueue* disk_queue, RequestRange** ran // We just picked a reader, check the mem limits. // TODO: we can do a lot better here. The reader can likely make progress // with fewer io buffers. - bool process_limit_exceeded = _process_mem_tracker->limit_exceeded(); + bool disk_io_limit_exceeded = _disk_io_mem_tracker->limit_exceeded(); bool reader_limit_exceeded = (*request_context)->_mem_tracker != nullptr - ? (*request_context)->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) + ? (*request_context)->_mem_tracker->any_limit_exceeded() : false; // bool reader_limit_exceeded = (*request_context)->_mem_tracker != nullptr // ? 
(*request_context)->_mem_tracker->limit_exceeded() : false; - if (process_limit_exceeded || reader_limit_exceeded) { + if (disk_io_limit_exceeded || reader_limit_exceeded) { (*request_context)->cancel(Status::MemoryLimitExceeded("Memory limit exceeded")); } @@ -1019,11 +1018,11 @@ void DiskIoMgr::read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRa int64_t buffer_size = std::min(bytes_remaining, static_cast(_max_buffer_size)); bool enough_memory = true; if (reader->_mem_tracker != nullptr) { - enough_memory = reader->_mem_tracker->SpareCapacity(MemLimit::HARD) > LOW_MEMORY; + enough_memory = reader->_mem_tracker->spare_capacity() > LOW_MEMORY; if (!enough_memory) { // Low memory, GC and try again. gc_io_buffers(); - enough_memory = reader->_mem_tracker->SpareCapacity(MemLimit::HARD) > LOW_MEMORY; + enough_memory = reader->_mem_tracker->spare_capacity() > LOW_MEMORY; } } diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h index af988fb73ee067..0679b8a1d72a28 100644 --- a/be/src/runtime/disk_io_mgr.h +++ b/be/src/runtime/disk_io_mgr.h @@ -542,7 +542,7 @@ class DiskIoMgr { ~DiskIoMgr(); // Initialize the IoMgr. Must be called once before any of the other APIs. - Status init(const std::shared_ptr& process_mem_tracker); + Status init(const int64_t mem_limit); // Allocates tracking structure for a request context. // Register a new request context which is returned in *request_context. @@ -691,8 +691,8 @@ class DiskIoMgr { // Pool to allocate BufferDescriptors. ObjectPool _pool; - // Process memory tracker; needed to account for io buffers. - std::shared_ptr _process_mem_tracker; + // account for io buffers. + std::shared_ptr _disk_io_mem_tracker; // Number of worker(read) threads per disk. Also the max depth of queued // work to the disk. 
diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index a5a10712fa7d28..932c8fb526ad1a 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -21,6 +21,7 @@ #include "common/status.h" #include "olap/options.h" #include "runtime/mem_tracker.h" +#include "runtime/mem_tracker_task_pool.h" #include "util/threadpool.h" namespace doris { @@ -46,7 +47,7 @@ class LoadPathMgr; class LoadStreamMgr; class MemTracker; class StorageEngine; -class QueryMemTrackerRegistry; +class MemTrackerTaskPool; class PriorityThreadPool; class ReservationTracker; class ResultBufferMgr; @@ -118,9 +119,10 @@ class ExecEnv { } std::shared_ptr process_mem_tracker() { return _process_mem_tracker; } - std::shared_ptr all_query_mem_tracker() { return _all_query_mem_tracker; } - QueryMemTrackerRegistry* query_mem_tracker_registry() { - return _query_mem_tracker_registry.get(); + std::shared_ptr new_process_mem_tracker() { return _new_process_mem_tracker; } + std::shared_ptr query_pool_mem_tracker() { return _query_pool_mem_tracker; } + MemTrackerTaskPool* task_pool_mem_tracker_registry() { + return _task_pool_mem_tracker_registry.get(); } ThreadResourceMgr* thread_mgr() { return _thread_mgr; } PriorityThreadPool* scan_thread_pool() { return _scan_thread_pool; } @@ -185,13 +187,16 @@ class ExecEnv { ClientCache* _frontend_client_cache = nullptr; ClientCache* _broker_client_cache = nullptr; ClientCache* _extdatasource_client_cache = nullptr; + ThreadResourceMgr* _thread_mgr = nullptr; + // The ancestor of all trackers in the process. It is the only child of the root tracker. // All manually created trackers should specify the process tracker as the parent. std::shared_ptr _process_mem_tracker = nullptr; + // TODO(zxy): Will replace _process_mem_tracker in future. + std::shared_ptr _new_process_mem_tracker = nullptr; // The ancestor for all querys tracker. 
- std::shared_ptr _all_query_mem_tracker = nullptr; - std::unique_ptr _query_mem_tracker_registry; - ThreadResourceMgr* _thread_mgr = nullptr; + std::shared_ptr _query_pool_mem_tracker = nullptr; + std::unique_ptr _task_pool_mem_tracker_registry; // The following two thread pools are used in different scenarios. // _scan_thread_pool is a priority thread pool. diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 129dcb1dbdb94f..89d67f1eb19d14 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -44,6 +44,7 @@ #include "runtime/load_channel_mgr.h" #include "runtime/load_path_mgr.h" #include "runtime/mem_tracker.h" +#include "runtime/mem_tracker_task_pool.h" #include "runtime/result_buffer_mgr.h" #include "runtime/result_queue_mgr.h" #include "runtime/routine_load/routine_load_task_executor.h" @@ -93,7 +94,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _broker_client_cache = new BrokerServiceClientCache(config::max_client_cache_size_per_host); _extdatasource_client_cache = new ExtDataSourceServiceClientCache(config::max_client_cache_size_per_host); - _query_mem_tracker_registry.reset(new QueryMemTrackerRegistry()); + _task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); _thread_mgr = new ThreadResourceMgr(); _scan_thread_pool = new PriorityThreadPool(config::doris_scanner_thread_pool_thread_num, config::doris_scanner_thread_pool_queue_size); @@ -173,14 +174,14 @@ Status ExecEnv::_init_mem_tracker() { << ". 
Using physical memory instead"; global_memory_limit_bytes = MemInfo::physical_mem(); } - _process_mem_tracker = MemTracker::CreateTracker(global_memory_limit_bytes, "Process", - MemTracker::GetRootTracker(), false, false, - MemTrackerLevel::OVERVIEW); + _process_mem_tracker = MemTracker::create_tracker(global_memory_limit_bytes, "Process", + MemTracker::get_root_tracker(), MemTrackerLevel::OVERVIEW); + _new_process_mem_tracker = MemTracker::create_tracker(global_memory_limit_bytes, "NewProcess", + MemTracker::get_root_tracker(), MemTrackerLevel::OVERVIEW); + _query_pool_mem_tracker = MemTracker::create_tracker(global_memory_limit_bytes, "QueryPool", + _new_process_mem_tracker, MemTrackerLevel::OVERVIEW); REGISTER_HOOK_METRIC(query_mem_consumption, - [this]() { return _process_mem_tracker->consumption(); }); - _all_query_mem_tracker = - MemTracker::CreateTracker(global_memory_limit_bytes, "All Query", _process_mem_tracker, - false, false, MemTrackerLevel::OVERVIEW); + [this]() { return _query_pool_mem_tracker->consumption(); }); LOG(INFO) << "Using global memory limit: " << PrettyPrinter::print(global_memory_limit_bytes, TUnit::BYTES) << ", origin config value: " << config::mem_limit; @@ -245,7 +246,7 @@ Status ExecEnv::_init_mem_tracker() { SegmentLoader::create_global_instance(config::segment_cache_capacity); // 4. 
init other managers - RETURN_IF_ERROR(_disk_io_mgr->init(_process_mem_tracker)); + RETURN_IF_ERROR(_disk_io_mgr->init(_process_mem_tracker->limit())); RETURN_IF_ERROR(_tmp_file_mgr->init()); // TODO(zc): The current memory usage configuration is a bit confusing, diff --git a/be/src/runtime/export_sink.cpp b/be/src/runtime/export_sink.cpp index 9cc9f4c293792f..b1dbcdb5bc20ad 100644 --- a/be/src/runtime/export_sink.cpp +++ b/be/src/runtime/export_sink.cpp @@ -72,7 +72,7 @@ Status ExportSink::prepare(RuntimeState* state) { _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); SCOPED_TIMER(_profile->total_time_counter()); - _mem_tracker = MemTracker::CreateTracker(-1, "ExportSink", state->instance_mem_tracker()); + _mem_tracker = MemTracker::create_tracker(-1, "ExportSink", state->instance_mem_tracker()); // Prepare the exprs to run. RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker)); diff --git a/be/src/runtime/fold_constant_executor.cpp b/be/src/runtime/fold_constant_executor.cpp index f093c04235ee93..2ecc6ecb6c620a 100644 --- a/be/src/runtime/fold_constant_executor.cpp +++ b/be/src/runtime/fold_constant_executor.cpp @@ -188,7 +188,7 @@ Status FoldConstantExecutor::_init(const TQueryGlobals& query_globals) { _runtime_profile = _runtime_state->runtime_profile(); _runtime_profile->set_name("FoldConstantExpr"); - _mem_tracker = MemTracker::CreateTracker(-1, "FoldConstantExpr", _runtime_state->instance_mem_tracker()); + _mem_tracker = MemTracker::create_tracker(-1, "FoldConstantExpr", _runtime_state->instance_mem_tracker()); _mem_pool.reset(new MemPool(_mem_tracker.get())); return Status::OK(); diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index efc8e59d94ba7d..4884aaaaa222d4 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -470,8 +470,10 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi .query_id(exec_state->query_id()) 
.instance_id(exec_state->fragment_instance_id()) .tag("pthread_id", std::to_string((uintptr_t)pthread_self())); - thread_local_ctx.attach(ThreadContext::QUERY, print_id(exec_state->query_id()), - exec_state->fragment_instance_id()); + SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(exec_state->query_id()), + exec_state->fragment_instance_id()); + // thread_local_ctx.attach(ThreadContext::QUERY, print_id(exec_state->query_id()), + // exec_state->fragment_instance_id()); exec_state->execute(); std::shared_ptr fragments_ctx = exec_state->get_fragments_ctx(); @@ -492,7 +494,7 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi // Callback after remove from this id cb(exec_state->executor()); - thread_local_ctx.detach(); + // thread_local_ctx.detach(); } Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params) { diff --git a/be/src/runtime/initial_reservations.cc b/be/src/runtime/initial_reservations.cc index adbc2be09d7883..0e622e85328a5b 100644 --- a/be/src/runtime/initial_reservations.cc +++ b/be/src/runtime/initial_reservations.cc @@ -38,7 +38,7 @@ InitialReservations::InitialReservations(ObjectPool* obj_pool, std::shared_ptr query_mem_tracker, int64_t initial_reservation_total_claims) : initial_reservation_mem_tracker_( - MemTracker::CreateTracker(-1, "InitialReservations", query_mem_tracker, false)), + MemTracker::create_tracker(-1, "InitialReservations", query_mem_tracker)), remaining_initial_reservation_claims_(initial_reservation_total_claims) { initial_reservations_.InitChildTracker(nullptr, query_reservation, initial_reservation_mem_tracker_.get(), diff --git a/be/src/runtime/load_channel.cpp b/be/src/runtime/load_channel.cpp index ee33cc3fdbbc07..276425d0ab205b 100644 --- a/be/src/runtime/load_channel.cpp +++ b/be/src/runtime/load_channel.cpp @@ -28,8 +28,8 @@ LoadChannel::LoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t tim const std::string& sender_ip) : _load_id(load_id), _timeout_s(timeout_s), 
_is_high_priority(is_high_priority), _sender_ip(sender_ip) { - _mem_tracker = MemTracker::CreateTracker( - mem_limit, "LoadChannel:" + _load_id.to_string(), mem_tracker, true, false, MemTrackerLevel::TASK); + _mem_tracker = MemTracker::create_tracker( + mem_limit, "LoadChannel:" + _load_id.to_string(), mem_tracker, MemTrackerLevel::TASK); // _last_updated_time should be set before being inserted to // _load_channels in load_channel_mgr, or it may be erased // immediately by gc thread. diff --git a/be/src/runtime/load_channel_mgr.cpp b/be/src/runtime/load_channel_mgr.cpp index baa0e8891c41cf..6604e85fc1279e 100644 --- a/be/src/runtime/load_channel_mgr.cpp +++ b/be/src/runtime/load_channel_mgr.cpp @@ -85,7 +85,7 @@ LoadChannelMgr::~LoadChannelMgr() { Status LoadChannelMgr::init(int64_t process_mem_limit) { int64_t load_mem_limit = calc_process_max_load_memory(process_mem_limit); - _mem_tracker = MemTracker::CreateTracker(load_mem_limit, "LoadChannelMgr", nullptr, true, false, MemTrackerLevel::OVERVIEW); + _mem_tracker = MemTracker::create_tracker(load_mem_limit, "LoadChannelMgr", nullptr, MemTrackerLevel::OVERVIEW); REGISTER_HOOK_METRIC(load_mem_consumption, [this]() { return _mem_tracker->consumption(); }); diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp index bcaaa27c448b25..29ed12795dac05 100644 --- a/be/src/runtime/mem_pool.cpp +++ b/be/src/runtime/mem_pool.cpp @@ -24,6 +24,7 @@ #include "runtime/mem_tracker.h" #include "runtime/memory/chunk_allocator.h" +#include "runtime/thread_context.h" #include "util/bit_util.h" #include "util/doris_metrics.h" @@ -37,6 +38,17 @@ const int MemPool::MAX_CHUNK_SIZE; const int MemPool::DEFAULT_ALIGNMENT; uint32_t MemPool::k_zero_length_region_ alignas(std::max_align_t) = MEM_POOL_POISON; +MemPool::MemPool(MemTracker* mem_tracker) + : current_chunk_idx_(-1), + next_chunk_size_(INITIAL_CHUNK_SIZE), + total_allocated_bytes_(0), + total_reserved_bytes_(0), + peak_allocated_bytes_(0), + // 
new_mem_tracker_(thread_local_ctx.thread_mem_tracker()), + mem_tracker_(mem_tracker) { + DCHECK(mem_tracker != nullptr); +} + MemPool::ChunkInfo::ChunkInfo(const Chunk& chunk_) : chunk(chunk_), allocated_bytes(0) { DorisMetrics::instance()->memory_pool_bytes_total->increment(chunk.size); } @@ -45,9 +57,11 @@ MemPool::~MemPool() { int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; - ChunkAllocator::instance()->free(chunk.chunk); + ChunkAllocator::instance()->free(chunk.chunk, new_mem_tracker_); } - mem_tracker_->Release(total_bytes_released); + mem_tracker_->release(total_bytes_released); + // DCHECK(new_mem_tracker_ == thread_local_ctx.thread_mem_tracker()); + // new_mem_tracker_->release(total_bytes_released); DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } @@ -65,7 +79,7 @@ void MemPool::free_all() { int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; - ChunkAllocator::instance()->free(chunk.chunk); + ChunkAllocator::instance()->free(chunk.chunk, new_mem_tracker_); } chunks_.clear(); next_chunk_size_ = INITIAL_CHUNK_SIZE; @@ -73,7 +87,9 @@ void MemPool::free_all() { total_allocated_bytes_ = 0; total_reserved_bytes_ = 0; - mem_tracker_->Release(total_bytes_released); + mem_tracker_->release(total_bytes_released); + // DCHECK(new_mem_tracker_ == thread_local_ctx.thread_mem_tracker()); + // new_mem_tracker_->release(total_bytes_released); DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } @@ -115,18 +131,22 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) { } chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size); + // DCHECK(new_mem_tracker_ == thread_local_ctx.thread_mem_tracker()); if (check_limits) { - Status st = mem_tracker_->TryConsume(chunk_size); + Status st = mem_tracker_->try_consume(chunk_size); + // Status st2 = new_mem_tracker_->try_consume(chunk_size); 
WARN_IF_ERROR(st, "try to allocate a new buffer failed"); if (!st) return false; } else { - mem_tracker_->Consume(chunk_size); + mem_tracker_->consume(chunk_size); + // new_mem_tracker_->consume(chunk_size); } // Allocate a new chunk. Return early if allocate fails. Chunk chunk; - if (!ChunkAllocator::instance()->allocate(chunk_size, &chunk)) { - mem_tracker_->Release(chunk_size); + if (!ChunkAllocator::instance()->allocate(chunk_size, &chunk, new_mem_tracker_)) { + mem_tracker_->release(chunk_size); + // new_mem_tracker_->release(chunk_size); return false; } ASAN_POISON_MEMORY_REGION(chunk.data, chunk_size); @@ -173,9 +193,27 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) { // Skip unnecessary atomic ops if the mem_trackers are the same. if (src->mem_tracker_ != mem_tracker_) { - src->mem_tracker_->Release(total_transferred_bytes); - mem_tracker_->Consume(total_transferred_bytes); + src->mem_tracker_->release(total_transferred_bytes); + mem_tracker_->consume(total_transferred_bytes); } + // if (src->new_mem_tracker_ != new_mem_tracker_) { + // // if (task_type_ == "QUERY" && src->task_type() == "UNKNOWN") { + // // thread_local_ctx.consume_mem(total_transferred_bytes); + // // } + // // if (mem_tracker_->GetQueryMemTracker() != nullptr && + // // src->mem_tracker_->GetQueryMemTracker() == nullptr) { + // // thread_local_ctx.consume_mem(total_transferred_bytes); + // // } + // // if ((new_mem_tracker_.lock()->GetQueryMemTracker() != nullptr && + // // src->new_mem_tracker_.lock()->GetQueryMemTracker() == nullptr) || + // // (new_mem_tracker_.lock()->GetQueryMemTracker() == nullptr && + // // src->new_mem_tracker_.lock()->GetQueryMemTracker() != nullptr)) { + // // new_mem_tracker_.lock()->consume(total_transferred_bytes); + // // src->new_mem_tracker_.lock()->release(total_transferred_bytes); + // // } + // new_mem_tracker_->consume(total_transferred_bytes); + // src->new_mem_tracker_->release(total_transferred_bytes); + // } // insert new chunks 
after current_chunk_idx_ auto insert_chunk = chunks_.begin() + current_chunk_idx_ + 1; @@ -212,8 +250,14 @@ void MemPool::exchange_data(MemPool* other) { std::swap(chunks_, other->chunks_); // update MemTracker - mem_tracker_->Consume(delta_size); - other->mem_tracker_->Release(delta_size); + if (other->mem_tracker_ != mem_tracker_) { + mem_tracker_->consume(delta_size); + other->mem_tracker_->release(delta_size); + } + // if (other->new_mem_tracker_ != new_mem_tracker_) { + // new_mem_tracker_->consume(delta_size); + // other->new_mem_tracker_->release(delta_size); + // } } std::string MemPool::debug_string() { diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h index 04d62368315406..faa0a4ad9c4ec2 100644 --- a/be/src/runtime/mem_pool.h +++ b/be/src/runtime/mem_pool.h @@ -86,18 +86,12 @@ class MemTracker; /// At this point p.total_allocated_bytes_ would be 0. /// The one remaining (empty) chunk is released: /// delete p; +// +// 存在pool的申请和释放在不同线程被调用。 class MemPool { public: /// 'tracker' tracks the amount of memory allocated by this pool. Must not be nullptr. - MemPool(MemTracker* mem_tracker) - : current_chunk_idx_(-1), - next_chunk_size_(INITIAL_CHUNK_SIZE), - total_allocated_bytes_(0), - total_reserved_bytes_(0), - peak_allocated_bytes_(0), - mem_tracker_(mem_tracker) { - DCHECK(mem_tracker != nullptr); - } + MemPool(MemTracker* mem_tracker); /// Frees all chunks of memory and subtracts the total allocated bytes /// from the registered limits. @@ -279,6 +273,8 @@ class MemPool { /// The current and peak memory footprint of this pool. This is different from /// total allocated_bytes_ since it includes bytes in chunks that are not used. 
MemTracker* mem_tracker_; + + std::shared_ptr new_mem_tracker_; }; // Stamp out templated implementations here so they're included in IR module diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index 367ecb387c7a05..b3a3b07996d2c8 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -19,374 +19,138 @@ #include -#include -#include #include #include "exec/exec_node.h" #include "gutil/once.h" -#include "runtime/bufferpool/reservation_tracker_counters.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" #include "runtime/thread_context.h" #include "service/backend_options.h" -#include "util/debug_util.h" -#include "util/doris_metrics.h" -#include "util/mem_info.h" #include "util/pretty_printer.h" -#include "util/stack_util.h" #include "util/string_util.h" #include "util/uid_util.h" -using std::deque; -using std::endl; -using std::greater; -using std::list; -using std::pair; -using std::priority_queue; -using std::shared_ptr; -using std::string; - -using std::vector; -using std::weak_ptr; - namespace doris { const std::string MemTracker::COUNTER_NAME = "PeakMemoryUsage"; -// Name for query MemTrackers. '$0' is replaced with the query id. -const std::string QUERY_MEM_TRACKER_LABEL_FORMAT = "queryId={}"; - -/// Calculate the soft limit for a MemTracker based on the hard limit 'limit'. -static int64_t CalcSoftLimit(int64_t limit) { - if (limit < 0) { - return -1; - } - if (MemInfo::initialized() && limit > MemInfo::physical_mem()) { - LOG(WARNING) << "Memory limit " << PrettyPrinter::print(limit, TUnit::BYTES) - << " exceeds physical memory of " - << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); - limit = MemInfo::physical_mem(); - } - double frac = std::max(0.0, std::min(1.0, config::soft_mem_limit_frac)); - return static_cast(limit * frac); -} - // The ancestor for all trackers. Every tracker is visible from the root down. 
+// The consume/release of child tracker will not be synchronized to root tracker. +// It is used to independently statistics the real memory of the process in TCMalloc New/Delete Hook. static std::shared_ptr root_tracker; static GoogleOnceType root_tracker_once = GOOGLE_ONCE_INIT; -void MemTracker::CreateRootTracker() { - root_tracker.reset( - new MemTracker(nullptr, -1, "Root", nullptr, true, MemTrackerLevel::OVERVIEW)); +void MemTracker::create_root_tracker() { + root_tracker.reset(new MemTracker(-1, "Root", nullptr, MemTrackerLevel::OVERVIEW, nullptr)); root_tracker->Init(); } -// An independent tracker, no parent and child, -// used in tcmalloc new/delete hook to count the real memory of the process. -static std::shared_ptr global_hook_mem_tracker; -static GoogleOnceType global_hook_mem_tracker_once = GOOGLE_ONCE_INIT; - -void MemTracker::CreateGlobalHookTracker() { - global_hook_mem_tracker.reset( - new MemTracker(nullptr, -1, "Global Hook", nullptr, true, MemTrackerLevel::OVERVIEW)); - global_hook_mem_tracker->Init(); +std::shared_ptr MemTracker::get_root_tracker() { + GoogleOnceInit(&root_tracker_once, &MemTracker::create_root_tracker); + return root_tracker; } -std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, int64_t byte_limit, - const std::string& label, - const std::shared_ptr& parent, - bool reset_label_name, MemTrackerLevel level, - const std::string& query_id) { - std::shared_ptr real_parent; - std::string label_name; - // if parent is not null, reset label name to query id. 
- // The parent label always: RuntimeState:instance:8ca5a59e3aa84f74-84bb0d0466193736 - // we just need the last id of it: 8ca5a59e3aa84f74-84bb0d0466193736 - // to build the new label name of tracker: `label`: 8ca5a59e3aa84f74-84bb0d0466193736 - // else if parent is null - // just use the root is parent and keep the label_name as label - if (parent) { - real_parent = parent; - if (reset_label_name) { - std::vector tmp_result; - tmp_result = split(parent->label(), ":"); - label_name = label + ":" + tmp_result[tmp_result.size() - 1]; - } else { - label_name = label; +void MemTracker::list_root_trackers(std::vector>* trackers) { + trackers->clear(); + std::deque> to_process; + to_process.push_front(get_root_tracker()); + while (!to_process.empty()) { + std::shared_ptr t = to_process.back(); + to_process.pop_back(); + + trackers->push_back(t); + std::list> children; + { + lock_guard l(t->_child_trackers_lock); + children = t->_child_trackers; + } + for (const auto& child_weak : children) { + std::shared_ptr child = child_weak.lock(); + if (child && static_cast(child->_level) <= + config::mem_tracker_level) { + to_process.emplace_back(std::move(child)); + } } - } else { - real_parent = GetRootTracker(); - label_name = label; } - - shared_ptr tracker( - new MemTracker(profile, byte_limit, label_name, real_parent, true, - level > real_parent->_level ? level : real_parent->_level)); - real_parent->AddChildTracker(tracker); - tracker->Init(); - tracker->set_query_id(query_id); - - return tracker; } -std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const std::string& label, - std::shared_ptr parent, - bool log_usage_if_zero, bool reset_label_name, - MemTrackerLevel level, - const std::string& query_id) { - std::shared_ptr real_parent; - std::string label_name; - // if parent is not null, reset label name to query id. 
- // The parent label always: RuntimeState:instance:8ca5a59e3aa84f74-84bb0d0466193736 - // we just need the last id of it: 8ca5a59e3aa84f74-84bb0d0466193736 - // to build the new label name of tracker: `label`: 8ca5a59e3aa84f74-84bb0d0466193736 - // else if parent is null - // just use the root is parent and keep the label_name as label +std::shared_ptr MemTracker::create_tracker(int64_t byte_limit, const std::string& label, + const std::shared_ptr& parent, + MemTrackerLevel level, + RuntimeProfile* profile) { + std::shared_ptr reset_parent; + std::string reset_label = label; if (parent) { - real_parent = parent; - if (reset_label_name) { - std::vector tmp_result; - tmp_result = split(parent->label(), ":"); - label_name = label + ":" + tmp_result[tmp_result.size() - 1]; + // If parent contains query memtracker, add query ID to label. + if (parent->get_task_mem_tracker() != nullptr) { + std::vector parent_label = split(parent->get_task_mem_tracker()->label(), "="); + reset_label = label + ":" + parent_label[parent_label.size() - 1]; } else { - label_name = label; + reset_label = label; } + reset_parent = std::move(parent); } else { - real_parent = GetRootTracker(); - label_name = label; + reset_parent = get_root_tracker(); } - shared_ptr tracker( - new MemTracker(nullptr, byte_limit, label_name, real_parent, log_usage_if_zero, - level > real_parent->_level ? level : real_parent->_level)); - real_parent->AddChildTracker(tracker); + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + std::shared_ptr tracker( + new MemTracker(byte_limit, reset_label, reset_parent, + level > reset_parent->_level ? 
level : reset_parent->_level, profile)); + reset_parent->add_child_tracker(tracker); tracker->Init(); - tracker->set_query_id(query_id); - return tracker; } MemTracker::MemTracker(int64_t byte_limit, const std::string& label) - : MemTracker(nullptr, byte_limit, label, std::shared_ptr(), true, - MemTrackerLevel::VERBOSE) {} - -MemTracker::MemTracker(RuntimeProfile* profile, int64_t byte_limit, const string& label, - const std::shared_ptr& parent, bool log_usage_if_zero, - MemTrackerLevel level) - : limit_(byte_limit), - soft_limit_(CalcSoftLimit(byte_limit)), - label_(label), - parent_(parent), - consumption_metric_(nullptr), - log_usage_if_zero_(log_usage_if_zero), - _level(level), - num_gcs_metric_(nullptr), - bytes_freed_by_last_gc_metric_(nullptr), - bytes_over_limit_metric_(nullptr), - limit_metric_(nullptr) { + : MemTracker(byte_limit, label, std::shared_ptr(), MemTrackerLevel::VERBOSE, + nullptr) {} + +MemTracker::MemTracker(int64_t byte_limit, const std::string& label, + const std::shared_ptr& parent, MemTrackerLevel level, + RuntimeProfile* profile) + : _limit(byte_limit), _label(label), _parent(parent), _level(level) { if (profile == nullptr) { - consumption_ = std::make_shared(TUnit::BYTES); + _consumption = std::make_shared(TUnit::BYTES); } else { - consumption_ = profile->AddSharedHighWaterMarkCounter(COUNTER_NAME, TUnit::BYTES); + _consumption = profile->AddSharedHighWaterMarkCounter(COUNTER_NAME, TUnit::BYTES); } } void MemTracker::Init() { - DCHECK_GE(limit_, -1); - DCHECK_LE(soft_limit_, limit_); - // populate all_trackers_ and limit_trackers_ + DCHECK_GE(_limit, -1); MemTracker* tracker = this; while (tracker != nullptr) { - all_trackers_.push_back(tracker); - if (tracker->has_limit()) limit_trackers_.push_back(tracker); - tracker = tracker->parent_.get(); - } - DCHECK_GT(all_trackers_.size(), 0); - DCHECK_EQ(all_trackers_[0], this); -} - -void MemTracker::AddChildTracker(const std::shared_ptr& tracker) { - lock_guard l(child_trackers_lock_); - 
tracker->child_tracker_it_ = child_trackers_.insert(child_trackers_.end(), tracker); -} - -void MemTracker::EnableReservationReporting(const ReservationTrackerCounters& counters) { - delete reservation_counters_.swap(new ReservationTrackerCounters(counters)); -} - -int64_t MemTracker::GetLowestLimit(MemLimit mode) const { - if (limit_trackers_.empty()) return -1; - int64_t min_limit = std::numeric_limits::max(); - for (MemTracker* limit_tracker : limit_trackers_) { - DCHECK(limit_tracker->has_limit()); - min_limit = std::min(min_limit, limit_tracker->GetLimit(mode)); - } - return min_limit; -} - -int64_t MemTracker::SpareCapacity(MemLimit mode) const { - int64_t result = std::numeric_limits::max(); - for (const auto& tracker : limit_trackers_) { - int64_t mem_left = tracker->GetLimit(mode) - tracker->consumption(); - result = std::min(result, mem_left); - } - return result; -} - -void MemTracker::RefreshConsumptionFromMetric() { - DCHECK(consumption_metric_ != nullptr); - consumption_->set(consumption_metric_->value()); -} - -int64_t MemTracker::GetPoolMemReserved() { - DCHECK_EQ(limit_, -1) << LogUsage(UNLIMITED_DEPTH); - - // Use cache to avoid holding child_trackers_lock_ - list> children; - { - lock_guard l(child_trackers_lock_); - children = child_trackers_; - } - - int64_t mem_reserved = 0L; - for (const auto& child_weak : children) { - std::shared_ptr child = child_weak.lock(); - if (child) { - int64_t child_limit = child->limit(); - if (child_limit > 0) { - // Make sure we don't overflow if the query limits are set to ridiculous values. 
- mem_reserved += std::min(child_limit, MemInfo::physical_mem()); - } else { - DCHECK(child_limit == -1) << child->LogUsage(UNLIMITED_DEPTH); - mem_reserved += child->consumption(); - } - } - } - return mem_reserved; -} - -std::shared_ptr QueryMemTrackerRegistry::register_query_mem_tracker( - const std::string& query_id, int64_t mem_limit) { - DCHECK(!query_id.empty()); - VLOG_FILE << "Register query memory tracker, query id: " << query_id - << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); - - // First time this query_id registered, make a new object, otherwise do nothing. - // Combine CreateTracker and emplace into one operation to avoid the use of locks - _query_mem_trackers.try_emplace_l( - query_id, [](std::shared_ptr) {}, - MemTracker::CreateTracker(mem_limit, - fmt::format(QUERY_MEM_TRACKER_LABEL_FORMAT, query_id), - ExecEnv::GetInstance()->all_query_mem_tracker(), false, false, - MemTrackerLevel::OVERVIEW, query_id)); - std::shared_ptr tracker = get_query_mem_tracker(query_id); - if (tracker != nullptr) { - tracker->exist_consume_or_release_missing(); - } - return tracker; -} - -std::shared_ptr QueryMemTrackerRegistry::get_query_mem_tracker( - const std::string& query_id) { - DCHECK(!query_id.empty()); - std::shared_ptr tracker = nullptr; - // Avoid using locks to resolve erase conflicts - _query_mem_trackers.if_contains(query_id, - [&tracker](std::shared_ptr v) { tracker = v; }); - return tracker; -} - -void QueryMemTrackerRegistry::deregister_query_mem_tracker() { - std::vector expired_querys; - for (auto it = _query_mem_trackers.begin(); it != _query_mem_trackers.end(); it++) { - // No RuntimeState uses this query MemTracker, it is only referenced by this map, delete it - if (it->second.use_count() == 1) { - expired_querys.emplace_back(it->first); - } - } - for (auto qid : expired_querys) { - DCHECK(_query_mem_trackers[qid].use_count() == 1); - _query_mem_trackers.erase(qid); - VLOG_FILE << "Deregister query memory tracker, query id: " << 
qid; + _all_trackers.push_back(tracker); + if (tracker->has_limit()) _limit_trackers.push_back(tracker); + tracker = tracker->_parent.get(); } + DCHECK_GT(_all_trackers.size(), 0); + DCHECK_EQ(_all_trackers[0], this); } MemTracker::~MemTracker() { - if (label_ == "Global Hook") { - thread_local_ctx.stop_mem_tracker(); - } - delete reservation_counters_.load(); - if (parent()) { - if (!consume_or_release_missing()) { - DCHECK(consumption() == 0) << "Memory tracker " << debug_string() - << " has unreleased consumption " << consumption(); - } - parent_->Release(consumption()); - - lock_guard l(parent_->child_trackers_lock_); - if (child_tracker_it_ != parent_->child_trackers_.end()) { - parent_->child_trackers_.erase(child_tracker_it_); - child_tracker_it_ = parent_->child_trackers_.end(); - } - } -} - -void MemTracker::ListTrackers(vector>* trackers) { - trackers->clear(); - deque> to_process; - to_process.push_front(GetRootTracker()); - to_process.push_front(GetGlobalHookTracker()); - while (!to_process.empty()) { - shared_ptr t = to_process.back(); - to_process.pop_back(); - - trackers->push_back(t); - list> children; - { - lock_guard l(t->child_trackers_lock_); - children = t->child_trackers_; - } - for (const auto& child_weak : children) { - shared_ptr child = child_weak.lock(); - if (child && static_cast(child->_level) <= - config::mem_tracker_level) { - to_process.emplace_back(std::move(child)); + if (consumption() != 0) { + memory_leak_check(this); + // At present, it can only guarantee the accurate recording of the Instance tracker, + // lower layer has the problem of repeated release of different trackers, as explained above. 
+ if (_level <= MemTrackerLevel::INSTANCE) { + _parent->release(consumption()); } } - } -} -//void MemTracker::RegisterMetrics(MetricGroup* metrics, const string& prefix) { -// num_gcs_metric_ = metrics->AddCounter(strings::Substitute("$0.num-gcs", prefix), 0); -// -// // TODO: Consider a total amount of bytes freed counter -// bytes_freed_by_last_gc_metric_ = metrics->AddGauge( -// strings::Substitute("$0.bytes-freed-by-last-gc", prefix), -1); -// -// bytes_over_limit_metric_ = metrics->AddGauge( -// strings::Substitute("$0.bytes-over-limit", prefix), -1); -// -// limit_metric_ = metrics->AddGauge(strings::Substitute("$0.limit", prefix), limit_); -//} - -void MemTracker::TransferTo(MemTracker* dst, int64_t bytes) { - DCHECK_EQ(all_trackers_.back(), dst->all_trackers_.back()) << "Must have same root"; - // Find the common ancestor and update trackers between 'this'/'dst' and - // the common ancestor. This logic handles all cases, including the - // two trackers being the same or being ancestors of each other because - // 'all_trackers_' includes the current tracker. - int ancestor_idx = all_trackers_.size() - 1; - int dst_ancestor_idx = dst->all_trackers_.size() - 1; - while (ancestor_idx > 0 && dst_ancestor_idx > 0 && - all_trackers_[ancestor_idx - 1] == dst->all_trackers_[dst_ancestor_idx - 1]) { - --ancestor_idx; - --dst_ancestor_idx; + // Do not call release on the parent tracker to avoid repeated releases. + // Ensure that all consume/release are triggered by TCMalloc new/delete hook. + lock_guard l(_parent->_child_trackers_lock); + if (_child_tracker_it != _parent->_child_trackers.end()) { + _parent->_child_trackers.erase(_child_tracker_it); + _child_tracker_it = _parent->_child_trackers.end(); + } } - MemTracker* common_ancestor = all_trackers_[ancestor_idx]; - ReleaseLocal(bytes, common_ancestor); - dst->ConsumeLocal(bytes, common_ancestor); + // TCMalloc hook will be triggered during destructor memtracker, may cause crash. 
+ SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); } // Calling this on the query tracker results in output like: @@ -410,83 +174,48 @@ void MemTracker::TransferTo(MemTracker* dst, int64_t bytes) { // TrackerName: Limit=5.00 MB Reservation=5.00 MB OtherMemory=1.04 MB // Total=6.04 MB Peak=6.45 MB // -std::string MemTracker::LogUsage(int max_recursive_depth, const string& prefix, - int64_t* logged_consumption) { +std::string MemTracker::log_usage(int max_recursive_depth, int64_t* logged_consumption) { // Make sure the consumption is up to date. - if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric(); int64_t curr_consumption = consumption(); - int64_t peak_consumption = consumption_->value(); + int64_t peak_consumption = _consumption->value(); if (logged_consumption != nullptr) *logged_consumption = curr_consumption; - if (!log_usage_if_zero_ && curr_consumption == 0) return ""; - - std::stringstream ss; - ss << prefix << label_ << ":"; - if (CheckLimitExceeded(MemLimit::HARD)) ss << " memory limit exceeded."; - if (limit_ > 0) ss << " Limit=" << PrettyPrinter::print(limit_, TUnit::BYTES); - - // TODO(zxy): ReservationTrackerCounters is not actually used in the current Doris. - // Printing here ReservationTrackerCounters may cause BE crash when high concurrency. - // The memory tracker in Doris will be redesigned in the future. 
- // ReservationTrackerCounters* reservation_counters = reservation_counters_.load(); - // if (reservation_counters != nullptr) { - // int64_t reservation = reservation_counters->peak_reservation->current_value(); - // ss << " Reservation=" << PrettyPrinter::print(reservation, TUnit::BYTES); - // if (reservation_counters->reservation_limit != nullptr) { - // int64_t limit = reservation_counters->reservation_limit->value(); - // ss << " ReservationLimit=" << PrettyPrinter::print(limit, TUnit::BYTES); - // } - // ss << " OtherMemory=" << PrettyPrinter::print(curr_consumption - reservation, TUnit::BYTES); - // } - ss << " Total=" << PrettyPrinter::print(curr_consumption, TUnit::BYTES); - // Peak consumption is not accurate if the metric is lazily updated (i.e. - // this is a non-root tracker that exists only for reporting purposes). - // Only report peak consumption if we actually call Consume()/Release() on - // this tracker or an descendent. - if (consumption_metric_ == nullptr || parent_ == nullptr) { - ss << " Peak=" << PrettyPrinter::print(peak_consumption, TUnit::BYTES); - } + if (_level > MemTrackerLevel::INSTANCE && curr_consumption == 0) return ""; + + std::string detail = + "MemTracker log_usage Label: {}, Limit: {}, Total: {}, Peak: {}, Exceeded: {}"; + fmt::format(detail, _label, PrettyPrinter::print(_limit, TUnit::BYTES), + PrettyPrinter::print(curr_consumption, TUnit::BYTES), + PrettyPrinter::print(peak_consumption, TUnit::BYTES), + limit_exceeded() ? "true" : "false"); // This call does not need the children, so return early. 
- if (max_recursive_depth == 0) return ss.str(); + if (max_recursive_depth == 0) return detail; // Recurse and get information about the children - std::string new_prefix = fmt::format(" {}", prefix); int64_t child_consumption; std::string child_trackers_usage; - list> children; + std::list> children; { - lock_guard l(child_trackers_lock_); - children = child_trackers_; - } - child_trackers_usage = - LogUsage(max_recursive_depth - 1, new_prefix, children, &child_consumption); - if (!child_trackers_usage.empty()) ss << "\n" << child_trackers_usage; - - if (parent_ == nullptr) { - // Log the difference between the metric value and children as "untracked" memory so - // that the values always add up. This value is not always completely accurate because - // we did not necessarily get a consistent snapshot of the consumption values for all - // children at a single moment in time, but is good enough for our purposes. - int64_t untracked_bytes = curr_consumption - child_consumption; - ss << "\n" - << new_prefix - << "Untracked Memory: Total=" << PrettyPrinter::print(untracked_bytes, TUnit::BYTES); + lock_guard l(_child_trackers_lock); + children = _child_trackers; } - return ss.str(); + child_trackers_usage = log_usage(max_recursive_depth - 1, children, &child_consumption); + if (!child_trackers_usage.empty()) detail += "\n" + child_trackers_usage; + return detail; } -std::string MemTracker::LogUsage(int max_recursive_depth, const string& prefix, - const list>& trackers, - int64_t* logged_consumption) { +std::string MemTracker::log_usage(int max_recursive_depth, + const std::list>& trackers, + int64_t* logged_consumption) { *logged_consumption = 0; - std::vector usage_strings; + std::vector usage_strings; for (const auto& tracker_weak : trackers) { - shared_ptr tracker = tracker_weak.lock(); + std::shared_ptr tracker = tracker_weak.lock(); if (tracker) { int64_t tracker_consumption; std::string usage_string = - tracker->LogUsage(max_recursive_depth, prefix, 
&tracker_consumption); + tracker->log_usage(max_recursive_depth, &tracker_consumption); if (!usage_string.empty()) usage_strings.push_back(usage_string); *logged_consumption += tracker_consumption; } @@ -494,141 +223,67 @@ std::string MemTracker::LogUsage(int max_recursive_depth, const string& prefix, return join(usage_strings, "\n"); } -std::string MemTracker::LogTopNQueries(int limit) { - if (limit == 0) return ""; - priority_queue, std::vector>, - std::greater>> - min_pq; - GetTopNQueries(min_pq, limit); - std::vector usage_strings(min_pq.size()); - while (!min_pq.empty()) { - usage_strings.push_back(min_pq.top().second); - min_pq.pop(); - } - std::reverse(usage_strings.begin(), usage_strings.end()); - return join(usage_strings, "\n"); -} - -void MemTracker::GetTopNQueries( - priority_queue, std::vector>, - greater>>& min_pq, - int limit) { - list> children; - { - lock_guard l(child_trackers_lock_); - children = child_trackers_; - } - for (const auto& child_weak : children) { - shared_ptr child = child_weak.lock(); - if (child) { - child->GetTopNQueries(min_pq, limit); - } - } -} - -MemTracker* MemTracker::GetQueryMemTracker() { +MemTracker* MemTracker::get_task_mem_tracker() { MemTracker* tracker = this; - while (tracker != nullptr && !tracker->_is_query_mem_tracker) { - tracker = tracker->parent_.get(); + while (tracker != nullptr && tracker->_level != MemTrackerLevel::TASK) { + tracker = tracker->_parent.get(); } return tracker; } -Status MemTracker::MemLimitExceeded(MemTracker* mtracker, RuntimeState* state, - const std::string& details, int64_t failed_allocation_size) { +Status MemTracker::mem_limit_exceeded(RuntimeState* state, const std::string& details, + int64_t failed_allocation_size) { DCHECK_GE(failed_allocation_size, 0); - std::stringstream ss; - if (!details.empty()) ss << details << std::endl; - if (failed_allocation_size != 0) { - if (mtracker != nullptr) ss << mtracker->label(); - ss << " could not allocate " << 
PrettyPrinter::print(failed_allocation_size, TUnit::BYTES) - << " without exceeding limit." << std::endl; + MemTracker* process_tracker = ExecEnv::GetInstance()->process_mem_tracker().get(); + std::string detail = + "Memory exceed limit. details: {}, Label: {}, could not allocate size {} without " + "exceeding limit on backend: {}, Memory left in process limit: {}, by fragment: {}."; + fmt::format(detail, details, _label, PrettyPrinter::print(failed_allocation_size, TUnit::BYTES), + BackendOptions::get_localhost(), + PrettyPrinter::print(process_tracker->spare_capacity(), TUnit::BYTES), + print_id(state->fragment_instance_id())); + Status status = Status::MemoryLimitExceeded(detail); + if (state != nullptr) state->log_error(detail); + + // only print the tracker log_usage in be log. + if (process_tracker->spare_capacity() < failed_allocation_size) { + // Dumping the process MemTracker is expensive. Limiting the recursive depth to two + // levels limits the level of detail to a one-line summary for each query MemTracker. + detail += "\n" + process_tracker->log_usage(2); } - ss << "Error occurred on backend " << BackendOptions::get_localhost(); - if (state != nullptr) ss << " by fragment " << print_id(state->fragment_instance_id()); - ss << std::endl; - ExecEnv* exec_env = ExecEnv::GetInstance(); - MemTracker* process_tracker = exec_env->process_mem_tracker().get(); - const int64_t process_capacity = process_tracker->SpareCapacity(MemLimit::HARD); - ss << "Memory left in process limit: " << PrettyPrinter::print(process_capacity, TUnit::BYTES) - << std::endl; - Status status = Status::MemoryLimitExceeded(ss.str()); - - // only print the query tracker in be log(if available). 
- MemTracker* query_tracker = nullptr; - if (mtracker != nullptr) { - query_tracker = mtracker->GetQueryMemTracker(); - if (query_tracker != nullptr) { - if (query_tracker->has_limit()) { - const int64_t query_capacity = - query_tracker->limit() - query_tracker->consumption(); - ss << "Memory left in query limit: " - << PrettyPrinter::print(query_capacity, TUnit::BYTES) << std::endl; - } - ss << query_tracker->LogUsage(UNLIMITED_DEPTH); - } + if (get_task_mem_tracker() != nullptr) { + detail += "\n" + get_task_mem_tracker()->log_usage(); } + LOG(WARNING) << detail; - // Log the process level if the process tracker is close to the limit or - // if this tracker is not within a query's MemTracker hierarchy. - if (process_capacity < failed_allocation_size || query_tracker == nullptr) { - // IMPALA-5598: For performance reasons, limit the levels of recursion when - // dumping the process tracker to only two layers. - ss << process_tracker->LogUsage(PROCESS_MEMTRACKER_LIMITED_DEPTH); - } - if (state != nullptr) state->log_error(ss.str()); - LOG(WARNING) << ss.str(); return status; } -void MemTracker::AddGcFunction(GcFunction f) { - gc_functions_.push_back(f); -} - -bool MemTracker::LimitExceededSlow(MemLimit mode) { - if (mode == MemLimit::HARD && bytes_over_limit_metric_ != nullptr) { - bytes_over_limit_metric_->set_value(consumption() - limit_); - } - return GcMemory(GetLimit(mode)); +void MemTracker::add_gc_function(GcFunction f) { + _gc_functions.push_back(f); } -bool MemTracker::GcMemory(int64_t max_consumption) { +bool MemTracker::gc_memory(int64_t max_consumption) { if (max_consumption < 0) return true; - lock_guard l(gc_lock_); - if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric(); + lock_guard l(_gc_lock); int64_t pre_gc_consumption = consumption(); // Check if someone gc'd before us if (pre_gc_consumption < max_consumption) return false; - if (num_gcs_metric_ != nullptr) num_gcs_metric_->increment(1); int64_t curr_consumption = 
pre_gc_consumption; + const int64_t EXTRA_BYTES_TO_FREE = 512L * 1024L * 1024L; // Try to free up some memory - for (int i = 0; i < gc_functions_.size(); ++i) { + for (int i = 0; i < _gc_functions.size(); ++i) { // Try to free up the amount we are over plus some extra so that we don't have to // immediately GC again. Don't free all the memory since that can be unnecessarily // expensive. - const int64_t EXTRA_BYTES_TO_FREE = 512L * 1024L * 1024L; int64_t bytes_to_free = curr_consumption - max_consumption + EXTRA_BYTES_TO_FREE; - gc_functions_[i](bytes_to_free); - if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric(); + _gc_functions[i](bytes_to_free); curr_consumption = consumption(); if (max_consumption - curr_consumption <= EXTRA_BYTES_TO_FREE) break; } - if (bytes_freed_by_last_gc_metric_ != nullptr) { - bytes_freed_by_last_gc_metric_->set_value(pre_gc_consumption - curr_consumption); - } return curr_consumption > max_consumption; } -std::shared_ptr MemTracker::GetRootTracker() { - GoogleOnceInit(&root_tracker_once, &MemTracker::CreateRootTracker); - return root_tracker; -} - -std::shared_ptr MemTracker::GetGlobalHookTracker() { - GoogleOnceInit(&global_hook_mem_tracker_once, &MemTracker::CreateGlobalHookTracker); - return global_hook_mem_tracker; -} - } // namespace doris diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index 4a2e6bdd849ca4..2126895fa9946b 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -17,66 +17,30 @@ #pragma once -#include - #include -#include #include #include -#include -#include -#include -#include -#include -#include +#include "common/config.h" #include "common/status.h" -#include "gen_cpp/Types_types.h" // for TUniqueId #include "util/mem_info.h" -#include "util/metrics.h" #include "util/runtime_profile.h" #include "util/spinlock.h" namespace doris { -/// Mode argument passed to various MemTracker methods to indicate whether a soft or hard -/// limit should be used. 
-enum class MemLimit { HARD, SOFT }; - -/// The Level use to decide whether to show it in web page -/// each MemTracker have a Level equals to parent, only be set explicit -enum class MemTrackerLevel { OVERVIEW = 0, TASK, VERBOSE }; - -// The smallest negative number allowed for consumption value, Unit byte. -// Usually, a negative values means that the statistics are not accurate, -// but A small range of negative values ​​is allowed, because TCMalloc Hook will cache -// a batch of untracked values ​​when it consumes/releases MemTracker, -// which may cause tracker->consumption to be temporarily less than 0. -// Note that, this may obscure other errors. -// consumption_ < 0 will make the memory statistics inaccurate, so it should be avoided. -// 1. The released memory is not consumed. -// 2. The same block of memory, tracker A calls consume, and tracker B calls release. -// 3. Repeated releases of MemTacker. When the consume is called on the child MemTracker, -// after the release is called on the parent MemTracker, -// the child ~MemTracker will cause repeated releases. -static const int MIN_NEGATIVE_CONSUMPTION_VALUE = -10 * 1024 * 1024; - -class ObjectPool; +// The Level use to decide whether to show it in web page, +// each MemTracker have a Level less than or equal to parent, only be set explicit, +// TASK contains query, import, compaction, etc. +enum class MemTrackerLevel { OVERVIEW = 0, TASK, INSTANCE, VERBOSE }; + class MemTracker; -struct ReservationTrackerCounters; class RuntimeState; -class TQueryOptions; /// A MemTracker tracks memory consumption; it contains an optional limit /// and can be arranged into a tree structure such that the consumption tracked /// by a MemTracker is also tracked by its ancestors. /// -/// A MemTracker has a hard and a soft limit derived from the limit. If the hard limit -/// is exceeded, all memory allocations and queries should fail until we are under the -/// limit again. 
The soft limit can be exceeded without causing query failures, but -/// consumers of memory that can tolerate running without more memory should not allocate -/// memory in excess of the soft limit. -/// /// We use a five-level hierarchy of mem trackers: process, pool, query, fragment /// instance. Specific parts of the fragment (exec nodes, sinks, etc) will add a /// fifth level when they are initialized. This function also initializes a user @@ -93,7 +57,7 @@ class TQueryOptions; /// Release(). /// /// GcFunctions can be attached to a MemTracker in order to free up memory if the limit is -/// reached. If LimitExceeded() is called and the limit is exceeded, it will first call +/// reached. If limit_exceeded() is called and the limit is exceeded, it will first call /// the GcFunctions to try to free memory and recheck the limit. For example, the process /// tracker has a GcFunction that releases any unused memory still held by tcmalloc, so /// this will be called before the process limit is reported as exceeded. GcFunctions are @@ -102,135 +66,85 @@ class TQueryOptions; /// call back into MemTrackers, except to release memory. // /// This class is thread-safe. -class MemTracker : public std::enable_shared_from_this { +class MemTracker { public: - // Creates and adds the tracker to the tree so that it can be retrieved with - // FindTracker/FindOrCreateTracker. 
- static std::shared_ptr CreateTracker( + // Creates and adds the tracker to the tree + static std::shared_ptr create_tracker( int64_t byte_limit = -1, const std::string& label = std::string(), - std::shared_ptr parent = std::shared_ptr(), - bool log_usage_if_zero = true, bool reset_label_name = true, - MemTrackerLevel level = MemTrackerLevel::VERBOSE, - const std::string& query_id = std::string()); - - static std::shared_ptr CreateTracker( - RuntimeProfile* profile, int64_t byte_limit, const std::string& label = std::string(), const std::shared_ptr& parent = std::shared_ptr(), - bool reset_label_name = true, MemTrackerLevel level = MemTrackerLevel::VERBOSE, - const std::string& query_id = std::string()); + MemTrackerLevel level = MemTrackerLevel::VERBOSE, RuntimeProfile* profile = nullptr); // this is used for creating an orphan mem tracker, or for unit test. - // If a mem tracker has parent, it should be created by `CreateTracker()` + // If a mem tracker has parent, it should be created by `create_tracker()` MemTracker(int64_t byte_limit = -1, const std::string& label = std::string()); ~MemTracker(); // Returns a list of all the valid trackers. - static void ListTrackers(std::vector>* trackers); - - /// Include counters from a ReservationTracker in logs and other diagnostics. - /// The counters should be owned by the fragment's RuntimeProfile. - void EnableReservationReporting(const ReservationTrackerCounters& counters); + static void list_root_trackers(std::vector>* trackers); // Gets a shared_ptr to the "root" tracker, creating it if necessary. - static std::shared_ptr GetRootTracker(); - - // Gets a shared_ptr to the "global_hook" tracker, creating it if necessary. - static std::shared_ptr GetGlobalHookTracker(); - - /// Increases consumption of this tracker and its ancestors by 'bytes'. 
- void Consume(int64_t bytes) { - // DCHECK_GE(bytes, 0); - if (bytes < 0) { - Release(-bytes); - return; - } - if (bytes == 0) { + static std::shared_ptr get_root_tracker(); + + // Increases consumption of this tracker and its ancestors by 'bytes'. + // up to (but not including) end_tracker. + // This is useful if we want to move tracking between trackers that share a common (i.e. end_tracker) + // ancestor. This happens when we want to update tracking on a particular mem tracker but the consumption + // against the limit recorded in one of its ancestors already happened. + void consume(int64_t bytes, MemTracker* end_tracker = nullptr) { + if (bytes <= 0) { + release(-bytes, end_tracker); return; } - - if (UNLIKELY(consumption_metric_ != nullptr)) { - RefreshConsumptionFromMetric(); - return; // TODO(yingchun): why return not update tracker? - } - for (auto& tracker : all_trackers_) { - tracker->consumption_->add(bytes); - if (LIKELY(tracker->consumption_metric_ == nullptr)) { - DCHECK_GE(tracker->consumption_->current_value(), - std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, - -config::mem_tracker_consume_min_size_bytes * 10)); - } + for (auto& tracker : _all_trackers) { + if (tracker == end_tracker) return; + tracker->_consumption->add(bytes); + memory_leak_check(tracker); } } - /// Increases the consumption of this tracker and the ancestors up to (but - /// not including) end_tracker. This is useful if we want to move tracking between - /// trackers that share a common (i.e. end_tracker) ancestor. This happens when we want - /// to update tracking on a particular mem tracker but the consumption against - /// the limit recorded in one of its ancestors already happened. - void ConsumeLocal(int64_t bytes, MemTracker* end_tracker) { - DCHECK_GE(bytes, 0); - if (UNLIKELY(bytes < 0)) return; // needed in RELEASE, hits DCHECK in DEBUG - ChangeConsumption(bytes, end_tracker); - } - - /// Same as above, but it decreases the consumption. 
- void ReleaseLocal(int64_t bytes, MemTracker* end_tracker) { - DCHECK_GE(bytes, 0); - if (UNLIKELY(bytes < 0)) return; // needed in RELEASE, hits DCHECK in DEBUG - ChangeConsumption(-bytes, end_tracker); - } - - /// Increases consumption of this tracker and its ancestors by 'bytes' only if - /// they can all consume 'bytes' without exceeding limit (hard or soft) specified - /// by 'mode'. If any limit would be exceed, no MemTrackers are updated. If the - /// caller can tolerate an allocation failing, it should set mode=SOFT so that - /// other callers that may not tolerate allocation failures have a better chance - /// of success. Returns true if the consumption was successfully updated. + // Increases consumption of this tracker and its ancestors by 'bytes' only if + // they can all consume 'bytes' without exceeding limit. If limit would be exceed, + // no MemTrackers are updated. Returns true if the consumption was successfully updated. WARN_UNUSED_RESULT - Status TryConsume(int64_t bytes, MemLimit mode = MemLimit::HARD) { - // DCHECK_GE(bytes, 0); + Status try_consume(int64_t bytes) { if (bytes <= 0) { - Release(-bytes); + release(-bytes); return Status::OK(); } // TCMalloc new/delete hook will call consume before MemInfo is initialized. if (MemInfo::initialized() && MemInfo::current_mem() + bytes >= MemInfo::mem_limit()) { return Status::MemoryLimitExceeded(fmt::format( "{}: TryConsume failed, bytes={} process whole consumption={} mem limit={}", - label_, bytes, MemInfo::current_mem(), MemInfo::mem_limit())); + _label, bytes, MemInfo::current_mem(), MemInfo::mem_limit())); } - // if (UNLIKELY(bytes == 0)) return true; - // if (UNLIKELY(bytes < 0)) return false; // needed in RELEASE, hits DCHECK in DEBUG - if (UNLIKELY(consumption_metric_ != nullptr)) RefreshConsumptionFromMetric(); int i; // Walk the tracker tree top-down. 
- for (i = all_trackers_.size() - 1; i >= 0; --i) { - MemTracker* tracker = all_trackers_[i]; - const int64_t limit = tracker->GetLimit(mode); + for (i = _all_trackers.size() - 1; i >= 0; --i) { + MemTracker* tracker = _all_trackers[i]; + const int64_t limit = tracker->limit(); if (limit < 0) { - tracker->consumption_->add(bytes); // No limit at this tracker. + tracker->_consumption->add(bytes); // No limit at this tracker. } else { // If TryConsume fails, we can try to GC, but we may need to try several times if // there are concurrent consumers because we don't take a lock before trying to - // update consumption_. + // update _consumption. while (true) { - if (LIKELY(tracker->consumption_->try_add(bytes, limit))) break; + if (LIKELY(tracker->_consumption->try_add(bytes, limit))) break; - if (UNLIKELY(tracker->GcMemory(limit - bytes))) { - DCHECK_GE(i, 0); + if (UNLIKELY(tracker->gc_memory(limit - bytes))) { // Failed for this mem tracker. Roll back the ones that succeeded. - for (int j = all_trackers_.size() - 1; j > i; --j) { - all_trackers_[j]->consumption_->add(-bytes); + for (int j = _all_trackers.size() - 1; j > i; --j) { + _all_trackers[j]->_consumption->add(-bytes); } return Status::MemoryLimitExceeded(fmt::format( "{}: TryConsume failed, bytes={} consumption={} imit={} " "attempting to GC", - tracker->label(), bytes, tracker->consumption_->current_value(), + tracker->label(), bytes, tracker->_consumption->current_value(), limit)); } VLOG_NOTICE << "GC succeeded, TryConsume bytes=" << bytes - << " consumption=" << tracker->consumption_->current_value() + << " consumption=" << tracker->_consumption->current_value() << " limit=" << limit; } } @@ -240,74 +154,37 @@ class MemTracker : public std::enable_shared_from_this { return Status::OK(); } - /// Decreases consumption of this tracker and its ancestors by 'bytes'. - void Release(int64_t bytes) { - // DCHECK_GE(bytes, 0); + // Decreases consumption of this tracker and its ancestors by 'bytes'. 
+ // up to (but not including) end_tracker. + void release(int64_t bytes, MemTracker* end_tracker = nullptr) { if (bytes < 0) { - Consume(-bytes); + consume(-bytes, end_tracker); return; } - if (bytes == 0) { return; } - - // if (UNLIKELY(bytes <= 0)) return; // < 0 needed in RELEASE, hits DCHECK in DEBUG - - if (UNLIKELY(consumption_metric_ != nullptr)) { - RefreshConsumptionFromMetric(); - return; - } - for (auto& tracker : all_trackers_) { - tracker->consumption_->add(-bytes); - /// If a UDF calls FunctionContext::TrackAllocation() but allocates less than the - /// reported amount, the subsequent call to FunctionContext::Free() may cause the - /// process mem tracker to go negative until it is synced back to the tcmalloc - /// metric. Don't blow up in this case. (Note that this doesn't affect non-process - /// trackers since we can enforce that the reported memory usage is internally - /// consistent.) - if (LIKELY(tracker->consumption_metric_ == nullptr)) { - // A query corresponds to multiple threads, and each thread may have - // config::mem_tracker_consume_min_size_bytes. The length is not cosumeed. Here, - // 10 is just a guess. - DCHECK_GE(tracker->consumption_->current_value(), - std::min(MIN_NEGATIVE_CONSUMPTION_VALUE, - -config::mem_tracker_consume_min_size_bytes * 10)) - << std::endl - << tracker->LogUsage(UNLIMITED_DEPTH); - } + for (auto& tracker : _all_trackers) { + if (tracker == end_tracker) return; + tracker->_consumption->add(-bytes); + memory_leak_check(tracker); } } - /// Transfer 'bytes' of consumption from this tracker to 'dst', updating - /// all ancestors up to the first shared ancestor. Must not be used if - /// 'dst' has a limit, or an ancestor with a limit, that is not a common - /// ancestor with the tracker, because this does not check memory limits. - void TransferTo(MemTracker* dst, int64_t bytes); - - /// Returns true if a valid limit of this tracker or one of its ancestors is - /// exceeded. 
- bool AnyLimitExceeded(MemLimit mode) {
- for (const auto& tracker : limit_trackers_) {
- if (tracker->LimitExceeded(mode)) {
- return true;
+ static Status batch_consume(int64_t bytes,
+ const std::vector<std::shared_ptr<MemTracker>>& trackers) {
+ for (auto& tracker : trackers) {
+ Status st = tracker->try_consume(bytes);
+ if (!st) {
+ return st;
+ }
 }
 }
- return false;
- }
-
- /// If this tracker has a limit, checks the limit and attempts to free up some memory if
- /// the hard limit is exceeded by calling any added GC functions. Returns true if the
- /// limit is exceeded after calling the GC functions. Returns false if there is no limit
- /// or consumption is under the limit.
- bool LimitExceeded(MemLimit mode) {
- if (UNLIKELY(CheckLimitExceeded(mode))) return LimitExceededSlow(mode);
- return false;
+ return Status::OK();
 }

- // Return limit exceeded tracker or null
- MemTracker* find_limit_exceeded_tracker() {
- for (const auto& tracker : limit_trackers_) {
+ // Returns the tracker (this or an ancestor) whose valid limit is exceeded, or nullptr.
+ MemTracker* limit_exceeded_tracker() const {
+ for (const auto& tracker : _limit_trackers) {
 if (tracker->limit_exceeded()) {
 return tracker;
 }
@@ -315,78 +192,51 @@ class MemTracker : public std::enable_shared_from_this {
 return nullptr;
 }

- /// Returns the maximum consumption that can be made without exceeding the limit on
- /// this tracker or any of its parents. Returns int64_t::max() if there are no
- /// limits and a negative value if any limit is already exceeded.
- int64_t SpareCapacity(MemLimit mode) const;
-
- /// Refresh the memory consumption value from the consumption metric. Only valid to
- /// call if this tracker has a consumption metric. 
- void RefreshConsumptionFromMetric(); - - // TODO(yingchun): following functions are old style which have no MemLimit parameter - bool limit_exceeded() const { return limit_ >= 0 && limit_ < consumption(); } - - int64_t limit() const { return limit_; } - bool has_limit() const { return limit_ >= 0; } - - int64_t soft_limit() const { return soft_limit_; } - int64_t GetLimit(MemLimit mode) const { - if (mode == MemLimit::SOFT) return soft_limit(); - DCHECK_ENUM_EQ(mode, MemLimit::HARD); - return limit(); + bool any_limit_exceeded() const { + return limit_exceeded_tracker() != nullptr; } - const std::string& label() const { return label_; } - std::string query_id() { return query_id_; } - void set_query_id(const std::string& query_id) { - if (query_id != std::string()) { - query_id_ = query_id; - _is_query_mem_tracker = true; + // Returns the maximum consumption that can be made without exceeding the limit on + // this tracker or any of its parents. Returns int64_t::max() if there are no + // limits and a negative value if any limit is already exceeded. + int64_t spare_capacity() const { + int64_t result = std::numeric_limits::max(); + for (const auto& tracker : _limit_trackers) { + int64_t mem_left = tracker->limit() - tracker->consumption(); + result = std::min(result, mem_left); } + return result; } - bool consume_or_release_missing() { return _consume_or_release_missing; } - - void exist_consume_or_release_missing() { _consume_or_release_missing = true; } - - /// Returns the lowest limit for this tracker and its ancestors. Returns - /// -1 if there is no limit. - int64_t GetLowestLimit(MemLimit mode) const; + // Returns the lowest limit for this tracker and its ancestors. Returns -1 if there is no limit. 
+ int64_t get_lowest_limit() const { + if (_limit_trackers.empty()) return -1; + int64_t min_limit = std::numeric_limits::max(); + for (const auto& tracker : _limit_trackers) { + DCHECK(tracker->has_limit()); + min_limit = std::min(min_limit, tracker->limit()); + } + return min_limit; + } - /// Returns the memory 'reserved' by this resource pool mem tracker, which is the sum - /// of the memory reserved by the queries in it (i.e. its child trackers). The mem - /// reserved for a query that is currently executing is its limit_, if set (which - /// should be the common case with admission control). Otherwise, if the query has - /// no limit or the query is finished executing, the current consumption is used. - int64_t GetPoolMemReserved(); + bool limit_exceeded() const { return _limit >= 0 && _limit < consumption(); } + int64_t limit() const { return _limit; } + bool has_limit() const { return _limit >= 0; } - /// Returns the memory consumed in bytes. - int64_t consumption() const { return consumption_->current_value(); } + const std::string& label() const { return _label; } - /// Note that if consumption_ is based on consumption_metric_, this will the max value - /// we've recorded in consumption(), not necessarily the highest value - /// consumption_metric_ has ever reached. - int64_t peak_consumption() const { return consumption_->value(); } + // Returns the memory consumed in bytes. + int64_t consumption() const { return _consumption->current_value(); } + int64_t peak_consumption() const { return _consumption->value(); } - std::shared_ptr parent() const { return parent_; } + std::shared_ptr parent() const { return _parent; } - /// Signature for function that can be called to free some memory after limit is - /// reached. The function should try to free at least 'bytes_to_free' bytes of - /// memory. See the class header for further details on the expected behaviour of - /// these functions. 
typedef std::function GcFunction; - /// Add a function 'f' to be called if the limit is reached, if none of the other /// previously-added GC functions were successful at freeing up enough memory. /// 'f' does not need to be thread-safe as long as it is added to only one MemTracker. /// Note that 'f' must be valid for the lifetime of this MemTracker. - void AddGcFunction(GcFunction f); - - /// Register this MemTracker's metrics. Each key will be of the form - /// ".". - // TODO(yingchun): remove comments - //void RegisterMetrics(MetricGroup* metrics, const std::string& prefix); + void add_gc_function(GcFunction f); /// Logs the usage of this tracker and optionally its children (recursively). /// If 'logged_consumption' is non-nullptr, sets the consumption value logged. @@ -394,255 +244,131 @@ class MemTracker : public std::enable_shared_from_this { /// to include in the dump. If it is zero, then no children are dumped. /// Limiting the recursive depth reduces the cost of dumping, particularly /// for the process MemTracker. - /// TODO: once all memory is accounted in ReservationTracker hierarchy, move - /// reporting there. - std::string LogUsage(int max_recursive_depth, const std::string& prefix = "", - int64_t* logged_consumption = nullptr); - /// Dumping the process MemTracker is expensive. Limiting the recursive depth - /// to two levels limits the level of detail to a one-line summary for each query - /// MemTracker, avoiding all MemTrackers below that level. This provides a summary - /// of process usage with substantially lower cost than the full dump. - static const int PROCESS_MEMTRACKER_LIMITED_DEPTH = 2; - /// Unlimited dumping is useful for query memtrackers or error conditions that - /// are not performance sensitive - static const int UNLIMITED_DEPTH = INT_MAX; - - /// Logs the usage of 'limit' number of queries based on maximum total memory - /// consumption. 
- std::string LogTopNQueries(int limit); + std::string log_usage(int max_recursive_depth = INT_MAX, int64_t* logged_consumption = nullptr); /// Log the memory usage when memory limit is exceeded and return a status object with /// details of the allocation which caused the limit to be exceeded. /// If 'failed_allocation_size' is greater than zero, logs the allocation size. If /// 'failed_allocation_size' is zero, nothing about the allocation size is logged. /// If 'state' is non-nullptr, logs the error to 'state'. - Status MemLimitExceeded(RuntimeState* state, const std::string& details, - int64_t failed_allocation = 0) WARN_UNUSED_RESULT { - return MemLimitExceeded(this, state, details, failed_allocation); - } - - /// Makes MemLimitExceeded callable for nullptr MemTrackers. - static Status MemLimitExceeded(MemTracker* mtracker, RuntimeState* state, - const std::string& details, - int64_t failed_allocation = 0) WARN_UNUSED_RESULT; - - static void update_limits(int64_t bytes, - const std::vector>& trackers) { - for (auto& tracker : trackers) { - tracker->Consume(bytes); - } - } - - static bool limit_exceeded(const std::vector>& trackers) { - for (const auto& tracker : trackers) { - if (tracker->limit_exceeded()) { - // TODO: remove logging - LOG(WARNING) << "exceeded limit: limit=" << tracker->limit() - << " consumption=" << tracker->consumption(); - return true; - } - } - - return false; - } + Status mem_limit_exceeded(RuntimeState* state, const std::string& details = std::string(), + int64_t failed_allocation = 0) WARN_UNUSED_RESULT; std::string debug_string() { std::stringstream msg; - msg << "limit: " << limit_ << "; " - << "consumption: " << consumption_->current_value() << "; " - << "label: " << label_ << "; " - << "all tracker size: " << all_trackers_.size() << "; " - << "limit trackers size: " << limit_trackers_.size() << "; " - << "parent is null: " << ((parent_ == nullptr) ? 
"true" : "false") << "; "; + msg << "limit: " << _limit << "; " + << "consumption: " << _consumption->current_value() << "; " + << "label: " << _label << "; " + << "all tracker size: " << _all_trackers.size() << "; " + << "limit trackers size: " << _limit_trackers.size() << "; " + << "parent is null: " << ((_parent == nullptr) ? "true" : "false") << "; "; return msg.str(); } - bool is_consumption_metric_null() const { return consumption_metric_ == nullptr; } - static const std::string COUNTER_NAME; private: /// 'byte_limit' < 0 means no limit - /// 'label' is the label used in the usage string (LogUsage()) - /// If 'log_usage_if_zero' is false, this tracker (and its children) will not be - /// included in LogUsage() output if consumption is 0. - MemTracker(RuntimeProfile* profile, int64_t byte_limit, const std::string& label, - const std::shared_ptr& parent, bool log_usage_if_zero, MemTrackerLevel); + /// 'label' is the label used in the usage string (log_usage()) + MemTracker(int64_t byte_limit, const std::string& label, + const std::shared_ptr& parent, MemTrackerLevel, RuntimeProfile* profile); private: - friend class QueryMemTrackerRegistry; - - // TODO(HW): remove later - /// Closes this MemTracker. After closing it is invalid to consume memory on this - /// tracker and the tracker's consumption counter (which may be owned by a - /// RuntimeProfile, not this MemTracker) can be safely destroyed. MemTrackers without - /// consumption metrics in the context of a daemon must always be closed. - /// Idempotent: calling multiple times has no effect. - void Close(); - - /// Returns true if the current memory tracker's limit is exceeded. - bool CheckLimitExceeded(MemLimit mode) const { - int64_t limit = GetLimit(mode); - return limit >= 0 && limit < consumption(); - } - - /// Slow path for LimitExceeded(). - bool LimitExceededSlow(MemLimit mode); + // If consumption is higher than max_consumption, attempts to free memory by calling + // any added GC functions. 
Returns true if max_consumption is still exceeded. Takes gc_lock. + bool gc_memory(int64_t max_consumption); - /// If consumption is higher than max_consumption, attempts to free memory by calling - /// any added GC functions. Returns true if max_consumption is still exceeded. Takes - /// gc_lock. Updates metrics if initialized. - bool GcMemory(int64_t max_consumption); - - /// Walks the MemTracker hierarchy and populates all_trackers_ and + /// Walks the MemTracker hierarchy and populates _all_trackers and /// limit_trackers_ void Init(); - /// Adds tracker to child_trackers_ - void AddChildTracker(const std::shared_ptr& tracker); + // Adds tracker to _child_trackers + void add_child_tracker(const std::shared_ptr& tracker) { + std::lock_guard l(_child_trackers_lock); + tracker->_child_tracker_it = _child_trackers.insert(_child_trackers.end(), tracker); + } /// Log consumption of all the trackers provided. Returns the sum of consumption in /// 'logged_consumption'. 'max_recursive_depth' specifies the maximum number of levels /// of children to include in the dump. If it is zero, then no children are dumped. - static std::string LogUsage(int max_recursive_depth, const std::string& prefix, - const std::list>& trackers, - int64_t* logged_consumption); - - /// Helper function for LogTopNQueries that iterates through the MemTracker hierarchy - /// and populates 'min_pq' with 'limit' number of elements (that contain state related - /// to query MemTrackers) based on maximum total memory consumption. - void GetTopNQueries(std::priority_queue, - std::vector>, - std::greater>>& min_pq, - int limit); - - /// If an ancestor of this tracker is a query MemTracker, return that tracker. - /// Otherwise return nullptr. - MemTracker* GetQueryMemTracker(); - - /// Increases/Decreases the consumption of this tracker and the ancestors up to (but - /// not including) end_tracker. 
- void ChangeConsumption(int64_t bytes, MemTracker* end_tracker) {
- DCHECK(consumption_metric_ == nullptr) << "Should not be called on root.";
- for (MemTracker* tracker : all_trackers_) {
- if (tracker == end_tracker) return;
- DCHECK(!tracker->has_limit()) << tracker->label() << " have limit:" << tracker->limit();
- tracker->consumption_->add(bytes);
+ static std::string log_usage(int max_recursive_depth,
+ const std::list>& trackers,
+ int64_t* logged_consumption);
+
+ // Usually, negative values mean that the statistics are not accurate, e.g.:
+ // 1. The released memory is not consumed.
+ // 2. The same block of memory, tracker A calls consume, and tracker B calls release.
+ // 3. Repeated releases of MemTracker. When the consume is called on the child MemTracker,
+ // after the release is called on the parent MemTracker,
+ // the child ~MemTracker will cause repeated releases.
+ //
+ // But TCMalloc Hook will cache a batch of untracked values when it consumes/releases
+ // MemTracker, which may cause tracker->consumption to be temporarily less than 0.
+ // So a small range of negative values is allowed; note that this may obscure the above errors.
+ //
+ // A query corresponds to multiple threads, and each thread may have up to
+ // config::mem_tracker_consume_min_size_bytes not yet consumed. Here, 1024 is just a guess.
+ void memory_leak_check(MemTracker* tracker) {
+ if (config::memory_leak_detection) {
+ DCHECK_GE(tracker->_consumption->current_value(),
+ -config::mem_tracker_consume_min_size_bytes * 1024)
+ << std::endl
+ << tracker->log_usage();
 }
- DCHECK(false) << "end_tracker is not an ancestor";
 }

- // Creates the root tracker.
- static void CreateRootTracker();
-
- // Creates the global hook tracker.
- static void CreateGlobalHookTracker();
-
- /// Lock to protect GcMemory(). This prevents many GCs from occurring at once.
- std::mutex gc_lock_;
-
- /// True if this is a Query MemTracker returned from register_query_mem_tracker(). 
- bool _is_query_mem_tracker = false; + // If an ancestor of this tracker is a Task MemTracker, return that tracker. Otherwise return nullptr. + MemTracker* get_task_mem_tracker(); - /// Only valid for MemTrackers returned from register_query_mem_tracker() - std::string query_id_; - - /// Hard limit on memory consumption, in bytes. May not be exceeded. If limit_ == -1, - /// there is no consumption limit. - const int64_t limit_; - - /// Soft limit on memory consumption, in bytes. Can be exceeded but callers to - /// TryConsume() can opt not to exceed this limit. If -1, there is no consumption limit. - const int64_t soft_limit_; - - // Is there a situation where different MemTracker calls consume and release in the same block. - // Happened at: The current tracker calls consume/release, and other threads call release/consume. - bool _consume_or_release_missing = false; + // Creates the root tracker. + static void create_root_tracker(); - std::string label_; + // Limit on memory consumption, in bytes. If limit_ == -1, there is no consumption limit. + const int64_t _limit; - /// The parent of this tracker. The pointer is never modified, even after this tracker - /// is unregistered. - std::shared_ptr parent_; + std::string _label; - /// in bytes - std::shared_ptr consumption_; + MemTrackerLevel _level; - /// If non-nullptr, used to measure consumption (in bytes) rather than the values provided - /// to Consume()/Release(). Only used for the process tracker, thus parent_ should be - /// nullptr if consumption_metric_ is set. - IntGauge* consumption_metric_; + std::shared_ptr _parent; // The parent of this tracker. - /// If non-nullptr, counters from a corresponding ReservationTracker that should be - /// reported in logs and other diagnostics. Owned by this MemTracker. The counters - /// are owned by the fragment's RuntimeProfile. 
- AtomicPtr reservation_counters_; + std::shared_ptr _consumption; // in bytes - std::vector all_trackers_; // this tracker plus all of its ancestors - std::vector limit_trackers_; // all_trackers_ with valid limits + std::vector _all_trackers; // this tracker plus all of its ancestors + std::vector _limit_trackers; // _all_trackers with valid limits // All the child trackers of this tracker. Used for error reporting and // listing only (i.e. updating the consumption of a parent tracker does not // update that of its children). - SpinLock child_trackers_lock_; - std::list> child_trackers_; - - /// Iterator into parent_->child_trackers_ for this object. Stored to have O(1) - /// remove. - std::list>::iterator child_tracker_it_; - - /// Functions to call after the limit is reached to free memory. - std::vector gc_functions_; - - /// If false, this tracker (and its children) will not be included in LogUsage() output - /// if consumption is 0. - bool log_usage_if_zero_; - - MemTrackerLevel _level; - - /// The number of times the GcFunctions were called. - IntCounter* num_gcs_metric_; - - /// The number of bytes freed by the last round of calling the GcFunctions (-1 before any - /// GCs are performed). - IntGauge* bytes_freed_by_last_gc_metric_; - - /// The number of bytes over the limit we were the last time LimitExceeded() was called - /// and the limit was exceeded pre-GC. -1 if there is no limit or the limit was never - /// exceeded. - IntGauge* bytes_over_limit_metric_; - - /// Metric for limit_. - IntGauge* limit_metric_; + SpinLock _child_trackers_lock; + std::list> _child_trackers; + // Iterator into parent_->child_trackers_ for this object. Stored to have O(1) remove. + std::list>::iterator _child_tracker_it; + + // Lock to protect gc_memory(). This prevents many GCs from occurring at once. + std::mutex _gc_lock; + // Functions to call after the limit is reached to free memory. + std::vector _gc_functions; }; -// Global registry for query MemTrackers. 
Owned by ExecEnv. -class QueryMemTrackerRegistry { -public: - // Construct a MemTracker object for 'query_id' with 'mem_limit' as the memory limit. - // The MemTracker is a child of the process MemTracker, Calling this with the same - // 'query_id' will return the same MemTracker object. This is used to track the local - // memory usage of all querys executing. The first time this is called for a query, - // a new MemTracker object is created with the process tracker as its parent. - // Newly created trackers will always have a limit of -1. - std::shared_ptr register_query_mem_tracker(const std::string& query_id, - int64_t mem_limit = -1); - - std::shared_ptr get_query_mem_tracker(const std::string& query_id); - - void deregister_query_mem_tracker(); - -private: - // All per-query MemTracker objects. - // The life cycle of query memtracker in the process is the same as query runtime state, - // MemTrackers will be removed from this map after query finish or cancel. - using QueryTrackersMap = phmap::parallel_flat_hash_map< - std::string, std::shared_ptr, phmap::priv::hash_default_hash, - phmap::priv::hash_default_eq, - std::allocator>>, 12, - std::mutex>; - - QueryTrackersMap _query_mem_trackers; -}; +#define LIMIT_EXCEEDED(tracker, state, msg) \ + do { \ + stringstream str; \ + str << "Memory exceed limit. " << msg << " "; \ + str << "Backend: " << BackendOptions::get_localhost() << ", "; \ + str << "fragment: " << print_id(state->fragment_instance_id()) << " "; \ + str << "Used: " << tracker->consumption() << ", Limit: " << tracker->limit() << ". 
"; \ + str << "You can change the limit by session variable exec_mem_limit."; \ + return Status::MemoryLimitExceeded(str.str()); \ + } while (false) + +#define RETURN_IF_LIMIT_EXCEEDED(state, msg) \ + do { \ + /* if (UNLIKELY(MemTracker::limit_exceeded(*(state)->mem_trackers()))) { */ \ + MemTracker* tracker = state->instance_mem_tracker()->limit_exceeded_tracker(); \ + if (tracker != nullptr) { \ + LIMIT_EXCEEDED(tracker, state, msg); \ + } \ + } while (false) } // namespace doris diff --git a/be/src/runtime/mem_tracker_task_pool.cpp b/be/src/runtime/mem_tracker_task_pool.cpp new file mode 100644 index 00000000000000..a233b714ed01fe --- /dev/null +++ b/be/src/runtime/mem_tracker_task_pool.cpp @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "runtime/mem_tracker_task_pool.h" + +#include "common/config.h" +#include "runtime/exec_env.h" +#include "util/pretty_printer.h" + +namespace doris { + +std::shared_ptr MemTrackerTaskPool::register_query_mem_tracker( + const std::string& query_id, int64_t mem_limit) { + DCHECK(!query_id.empty()); + VLOG_FILE << "Register query memory tracker, query id: " << query_id + << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); + + // First time this query_id registered, make a new object, otherwise do nothing. + // Combine create_tracker and emplace into one operation to avoid the use of locks + // Name for query MemTrackers. '$0' is replaced with the query id. + _query_mem_trackers.try_emplace_l( + query_id, [](std::shared_ptr) {}, + MemTracker::create_tracker(mem_limit, fmt::format("queryId={}", query_id), + ExecEnv::GetInstance()->query_pool_mem_tracker(), + MemTrackerLevel::TASK)); + std::shared_ptr tracker = get_query_mem_tracker(query_id); + return tracker; +} + +std::shared_ptr MemTrackerTaskPool::get_query_mem_tracker(const std::string& query_id) { + DCHECK(!query_id.empty()); + std::shared_ptr tracker = nullptr; + // Avoid using locks to resolve erase conflicts + _query_mem_trackers.if_contains(query_id, + [&tracker](std::shared_ptr v) { tracker = v; }); + return tracker; +} + +void MemTrackerTaskPool::logout_query_mem_tracker() { + std::vector expired_querys; + for (auto it = _query_mem_trackers.begin(); it != _query_mem_trackers.end(); it++) { + // No RuntimeState uses this query MemTracker, it is only referenced by this map, delete it + if (it->second.use_count() == 1) { + if (!config::memory_leak_detection || it->second->consumption() == 0) { + expired_querys.emplace_back(it->first); + } else { + LOG(WARNING) << "Memory tracker " << it->second->debug_string() << " Memory leak " + << it->second->consumption(); + } + } + } + for (auto qid : expired_querys) { + DCHECK(_query_mem_trackers[qid].use_count() == 1); + 
_query_mem_trackers.erase(qid); + VLOG_FILE << "Deregister query memory tracker, query id: " << qid; + } +} + +// TODO(zxy) +// /// Logs the usage of 'limit' number of queries based on maximum total memory +// /// consumption. +// std::string MemTracker::LogTopNQueries(int limit) { +// if (limit == 0) return ""; +// priority_queue, std::vector>, +// std::greater>> +// min_pq; +// GetTopNQueries(min_pq, limit); +// std::vector usage_strings(min_pq.size()); +// while (!min_pq.empty()) { +// usage_strings.push_back(min_pq.top().second); +// min_pq.pop(); +// } +// std::reverse(usage_strings.begin(), usage_strings.end()); +// return join(usage_strings, "\n"); +// } + +// /// Helper function for LogTopNQueries that iterates through the MemTracker hierarchy +// /// and populates 'min_pq' with 'limit' number of elements (that contain state related +// /// to query MemTrackers) based on maximum total memory consumption. +// void MemTracker::GetTopNQueries( +// priority_queue, std::vector>, +// greater>>& min_pq, +// int limit) { +// list> children; +// { +// lock_guard l(child_trackers_lock_); +// children = child_trackers_; +// } +// for (const auto& child_weak : children) { +// shared_ptr child = child_weak.lock(); +// if (child) { +// child->GetTopNQueries(min_pq, limit); +// } +// } +// } + +} // namespace doris diff --git a/be/src/runtime/mem_tracker_task_pool.h b/be/src/runtime/mem_tracker_task_pool.h new file mode 100644 index 00000000000000..f927dffd369663 --- /dev/null +++ b/be/src/runtime/mem_tracker_task_pool.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <parallel_hashmap/phmap.h> + +#include "runtime/mem_tracker.h" + +namespace doris { + +// Global task pool for query MemTrackers. Owned by ExecEnv. +class MemTrackerTaskPool { +public: + // Construct a MemTracker object for 'query_id' with 'mem_limit' as the memory limit. + // The MemTracker is a child of the process MemTracker. Calling this with the same + // 'query_id' will return the same MemTracker object. This is used to track the local + // memory usage of all queries executing. The first time this is called for a query, + // a new MemTracker object is created with the process tracker as its parent. + // Newly created trackers will always have a limit of -1. + std::shared_ptr<MemTracker> register_query_mem_tracker(const std::string& query_id, + int64_t mem_limit = -1); + + std::shared_ptr<MemTracker> get_query_mem_tracker(const std::string& query_id); + + void logout_query_mem_tracker(); + +private: + // All per-query MemTracker objects. + // The life cycle of query memtracker in the process is the same as query runtime state; + // MemTrackers will be removed from this map after query finish or cancel. 
+ using TaskTrackersMap = phmap::parallel_flat_hash_map< + std::string, std::shared_ptr, phmap::priv::hash_default_hash, + phmap::priv::hash_default_eq, + std::allocator>>, 12, + std::mutex>; + + TaskTrackersMap _query_mem_trackers; +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/runtime/memory/chunk.h b/be/src/runtime/memory/chunk.h index 332631d3fba1aa..3be766981b28ac 100644 --- a/be/src/runtime/memory/chunk.h +++ b/be/src/runtime/memory/chunk.h @@ -22,6 +22,8 @@ namespace doris { +class MemTracker; + // A chunk of continuous memory. // Almost all files depend on this struct, and each modification // will result in recompilation of all files. So, we put it in a @@ -30,6 +32,7 @@ struct Chunk { uint8_t* data = nullptr; size_t size = 0; int core_id = -1; + std::shared_ptr mem_tracker = nullptr; }; } // namespace doris diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp index cbc2462953c882..435ae4093319de 100644 --- a/be/src/runtime/memory/chunk_allocator.cpp +++ b/be/src/runtime/memory/chunk_allocator.cpp @@ -22,8 +22,10 @@ #include #include "gutil/dynamic_annotations.h" +#include "runtime/mem_tracker.h" #include "runtime/memory/chunk.h" #include "runtime/memory/system_allocator.h" +#include "runtime/thread_context.h" #include "util/bit_util.h" #include "util/cpu_info.h" #include "util/doris_metrics.h" @@ -114,6 +116,10 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) : _reserve_bytes_limit(reserve_limit), _reserved_bytes(0), _arenas(CpuInfo::get_max_num_cores()) { + _chunk_allocator_mem_tracker = + MemTracker::create_tracker(static_cast(reserve_limit), "ChunkAllocator", + nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); for (int i = 0; i < _arenas.size(); ++i) { _arenas[i].reset(new ChunkArena()); } @@ -128,16 +134,24 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) 
INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_free_cost_ns); } -bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { +bool ChunkAllocator::allocate(size_t size, Chunk* chunk, std::shared_ptr caller_tracker) { // fast path: allocate from current core arena + chunk->mem_tracker = thread_local_ctx.thread_mem_tracker(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); + thread_local_ctx.transfer_to_external_tracker(chunk->mem_tracker, size); int core_id = CpuInfo::get_current_core(); chunk->size = size; chunk->core_id = core_id; + if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) { DCHECK_GE(_reserved_bytes, 0); _reserved_bytes.fetch_sub(size); chunk_pool_local_core_alloc_count->increment(1); + // thread_local_ctx.transfer_in_thread_tracker(_chunk_allocator_mem_tracker, size); + // thread_local_ctx.transfer_to_external_tracker(chunk->mem_tracker, size); + // thread_local_ctx.transfer_to_external_tracker(caller_tracker, size); + // thread_local_ctx.consume_mem(size); return true; } if (_reserved_bytes > size) { @@ -150,6 +164,10 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { chunk_pool_other_core_alloc_count->increment(1); // reset chunk's core_id to other chunk->core_id = core_id % _arenas.size(); + // thread_local_ctx.transfer_to_external_tracker(chunk->mem_tracker, size); + // thread_local_ctx.transfer_in_thread_tracker(_chunk_allocator_mem_tracker, size); + // thread_local_ctx.transfer_to_external_tracker(caller_tracker, size); + // thread_local_ctx.consume_mem(size); return true; } } @@ -160,19 +178,29 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { SCOPED_RAW_TIMER(&cost_ns); // allocate from system allocator chunk->data = SystemAllocator::allocate(size); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); + // _chunk_allocator_mem_tracker->consume(size); } chunk_pool_system_alloc_count->increment(1); 
chunk_pool_system_alloc_cost_ns->increment(cost_ns); if (chunk->data == nullptr) { + thread_local_ctx.transfer_in_thread_tracker(chunk->mem_tracker, size); return false; } + // thread_local_ctx.transfer_to_external_tracker(chunk->mem_tracker, size); + // thread_local_ctx.transfer_in_thread_tracker(_chunk_allocator_mem_tracker, size); + // thread_local_ctx.transfer_to_external_tracker(caller_tracker, size); + // thread_local_ctx.consume_mem(size); return true; } -void ChunkAllocator::free(const Chunk& chunk) { +void ChunkAllocator::free(Chunk& chunk, std::shared_ptr caller_tracker) { if (chunk.core_id == -1) { return; } + DCHECK(chunk.mem_tracker != nullptr); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); + thread_local_ctx.transfer_in_thread_tracker(chunk.mem_tracker, chunk.size); int64_t old_reserved_bytes = _reserved_bytes; int64_t new_reserved_bytes = 0; do { @@ -182,7 +210,13 @@ void ChunkAllocator::free(const Chunk& chunk) { { SCOPED_RAW_TIMER(&cost_ns); SystemAllocator::free(chunk.data, chunk.size); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); + // _chunk_allocator_mem_tracker->release(chunk.size); } + // thread_local_ctx.release_mem(chunk.size); + // chunk.mem_tracker->transfer_to(_chunk_allocator_mem_tracker, chunk.size); + // thread_local_ctx.transfer_in_thread_tracker(chunk.mem_tracker, chunk.size); + // thread_local_ctx.transfer_in_thread_tracker(caller_tracker, chunk.size); chunk_pool_system_free_count->increment(1); chunk_pool_system_free_cost_ns->increment(cost_ns); @@ -191,6 +225,11 @@ void ChunkAllocator::free(const Chunk& chunk) { } while (!_reserved_bytes.compare_exchange_weak(old_reserved_bytes, new_reserved_bytes)); _arenas[chunk.core_id]->push_free_chunk(chunk.data, chunk.size); + chunk.mem_tracker = nullptr; + // thread_local_ctx.transfer_in_thread_tracker(chunk.mem_tracker, chunk.size); + // chunk.mem_tracker->transfer_to(_chunk_allocator_mem_tracker, chunk.size); + // 
thread_local_ctx.transfer_in_thread_tracker(caller_tracker, chunk.size); + // thread_local_ctx.release_mem(chunk.size); } bool ChunkAllocator::allocate_align(size_t size, Chunk* chunk) { diff --git a/be/src/runtime/memory/chunk_allocator.h b/be/src/runtime/memory/chunk_allocator.h index d7eb22fefed8a3..0ccd967c943c47 100644 --- a/be/src/runtime/memory/chunk_allocator.h +++ b/be/src/runtime/memory/chunk_allocator.h @@ -28,6 +28,7 @@ namespace doris { class Chunk; class ChunkArena; class MetricEntity; +class MemTracker; // Used to allocate memory with power-of-two length. // This Allocator allocate memory from system and cache free chunks for @@ -63,12 +64,12 @@ class ChunkAllocator { // Allocate a Chunk with a power-of-two length "size". // Return true if success and allocated chunk is saved in "chunk". // Otherwise return false. - bool allocate(size_t size, Chunk* chunk); + bool allocate(size_t size, Chunk* chunk, std::shared_ptr caller_tracker = nullptr); bool allocate_align(size_t size, Chunk* chunk); // Free chunk allocated from this allocator - void free(const Chunk& chunk); + void free(Chunk& chunk, std::shared_ptr caller_tracker = nullptr); private: static ChunkAllocator* _s_instance; @@ -79,6 +80,8 @@ class ChunkAllocator { std::vector> _arenas; std::shared_ptr _chunk_allocator_metric_entity; + + std::shared_ptr _chunk_allocator_mem_tracker; }; } // namespace doris diff --git a/be/src/runtime/mysql_table_sink.cpp b/be/src/runtime/mysql_table_sink.cpp index cb7911d9f2b904..0e5042c9b90afa 100644 --- a/be/src/runtime/mysql_table_sink.cpp +++ b/be/src/runtime/mysql_table_sink.cpp @@ -33,7 +33,7 @@ MysqlTableSink::MysqlTableSink(ObjectPool* pool, const RowDescriptor& row_desc, : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs), - _mem_tracker(MemTracker::CreateTracker(-1, "MysqlTableSink")) { + _mem_tracker(MemTracker::create_tracker(-1, "MysqlTableSink")) { _name = "MysqlTableSink"; } diff --git a/be/src/runtime/odbc_table_sink.cpp 
b/be/src/runtime/odbc_table_sink.cpp index b92b1517a0f5af..e9c166d8b7e0b7 100644 --- a/be/src/runtime/odbc_table_sink.cpp +++ b/be/src/runtime/odbc_table_sink.cpp @@ -32,7 +32,7 @@ OdbcTableSink::OdbcTableSink(ObjectPool* pool, const RowDescriptor& row_desc, : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs), - _mem_tracker(MemTracker::CreateTracker(-1, "OdbcTableSink")) { + _mem_tracker(MemTracker::create_tracker(-1, "OdbcTableSink")) { _name = "OOBC_TABLE_SINK"; } diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index 35cd189e230ca0..7b57b3420add43 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -128,11 +128,11 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, bytes_limit = _exec_env->process_mem_tracker()->limit(); } // NOTE: this MemTracker only for olap - _mem_tracker = MemTracker::CreateTracker(bytes_limit, - "PlanFragmentExecutor:" + print_id(_query_id) + ":" + - print_id(params.fragment_instance_id), - _exec_env->process_mem_tracker(), true, false, - MemTrackerLevel::TASK); + _mem_tracker = + MemTracker::create_tracker(bytes_limit, + "PlanFragmentExecutor:" + print_id(_query_id) + ":" + + print_id(params.fragment_instance_id), + _exec_env->process_mem_tracker(), MemTrackerLevel::INSTANCE); _runtime_state->set_fragment_mem_tracker(_mem_tracker); RETURN_IF_ERROR(_runtime_state->create_block_mgr()); @@ -464,6 +464,8 @@ void PlanFragmentExecutor::_collect_node_statistics() { } void PlanFragmentExecutor::report_profile() { + SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id()); VLOG_FILE << "report_profile(): instance_id=" << _runtime_state->fragment_instance_id(); DCHECK(_report_status_cb); @@ -702,7 +704,7 @@ void PlanFragmentExecutor::close() { // _mem_tracker init failed if (_mem_tracker.get() != nullptr) { - 
_mem_tracker->Release(_mem_tracker->consumption()); + _mem_tracker->release(_mem_tracker->consumption()); } _closed = true; } diff --git a/be/src/runtime/result_file_sink.cpp b/be/src/runtime/result_file_sink.cpp index efe367cda6931f..c35722c3c70e1c 100644 --- a/be/src/runtime/result_file_sink.cpp +++ b/be/src/runtime/result_file_sink.cpp @@ -110,9 +110,9 @@ Status ResultFileSink::prepare(RuntimeState* state) { _local_bytes_send_counter = ADD_COUNTER(profile(), "LocalBytesSent", TUnit::BYTES); _uncompressed_bytes_counter = ADD_COUNTER(profile(), "UncompressedRowBatchSize", TUnit::BYTES); - _mem_tracker = MemTracker::CreateTracker( - _profile, -1, "ResultFileSink:" + print_id(state->fragment_instance_id()), - state->instance_mem_tracker()); + _mem_tracker = MemTracker::create_tracker( + -1, "ResultFileSink:" + print_id(state->fragment_instance_id()), + state->instance_mem_tracker(), MemTrackerLevel::VERBOSE, _profile); // create writer _output_batch = new RowBatch(_output_row_descriptor, 1024, _mem_tracker.get()); _writer.reset(new (std::nothrow) FileResultWriter( diff --git a/be/src/runtime/result_sink.cpp b/be/src/runtime/result_sink.cpp index 610f105074c5bb..b83ae8af3b0f4e 100644 --- a/be/src/runtime/result_sink.cpp +++ b/be/src/runtime/result_sink.cpp @@ -23,6 +23,7 @@ #include "runtime/exec_env.h" #include "runtime/file_result_writer.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "runtime/mysql_result_writer.h" #include "runtime/result_buffer_mgr.h" #include "runtime/row_batch.h" @@ -100,6 +101,10 @@ Status ResultSink::open(RuntimeState* state) { } Status ResultSink::send(RuntimeState* state, RowBatch* batch) { + // The memory consumption in the process of sending the results is not recorded in the query memory. + // 1. Avoid the query being cancelled when the memory limit is reached after the query result comes out. + // 2. 
If record this memory, also need to record on the receiving end, need to consider the life cycle of MemTracker. + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); return _writer->append_row_batch(batch); } diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp index eb223d661d5d7e..53b8f8f3948d75 100644 --- a/be/src/runtime/row_batch.cpp +++ b/be/src/runtime/row_batch.cpp @@ -59,7 +59,7 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_ DCHECK_GT(_tuple_ptrs_size, 0); // TODO: switch to Init() pattern so we can check memory limit and return Status. if (config::enable_partitioned_aggregation) { - _mem_tracker->Consume(_tuple_ptrs_size); + _mem_tracker->consume(_tuple_ptrs_size); _tuple_ptrs = (Tuple**)(malloc(_tuple_ptrs_size)); DCHECK(_tuple_ptrs != nullptr); } else { @@ -91,7 +91,7 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, DCHECK_GT(_tuple_ptrs_size, 0); // TODO: switch to Init() pattern so we can check memory limit and return Status. if (config::enable_partitioned_aggregation) { - _mem_tracker->Consume(_tuple_ptrs_size); + _mem_tracker->consume(_tuple_ptrs_size); _tuple_ptrs = (Tuple**)(malloc(_tuple_ptrs_size)); DCHECK(_tuple_ptrs != nullptr); } else { @@ -215,6 +215,138 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, } } +// TODO: we want our input_batch's tuple_data to come from our (not yet implemented) +// global runtime memory segment; how do we get thrift to allocate it from there? 
+// maybe change line (in Data_types.cc generated from Data.thrift) +// xfer += iprot->readString(this->tuple_data[_i9]); +// to allocated string data in special mempool +// (change via python script that runs over Data_types.cc) +RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch, MemTracker* tracker) + : _mem_tracker(tracker), + _has_in_flight_row(false), + _num_rows(input_batch.num_rows), + _num_uncommitted_rows(0), + _capacity(_num_rows), + _flush(FlushMode::NO_FLUSH_RESOURCES), + _needs_deep_copy(false), + _num_tuples_per_row(input_batch.row_tuples.size()), + _row_desc(row_desc), + _auxiliary_mem_usage(0), + _need_to_return(false), + _tuple_data_pool(_mem_tracker) { + DCHECK(_mem_tracker != nullptr); + _tuple_ptrs_size = _num_rows * input_batch.row_tuples.size() * sizeof(Tuple*); + DCHECK_GT(_tuple_ptrs_size, 0); + // TODO: switch to Init() pattern so we can check memory limit and return Status. + if (config::enable_partitioned_aggregation) { + _mem_tracker->consume(_tuple_ptrs_size); + _tuple_ptrs = (Tuple**)malloc(_tuple_ptrs_size); + DCHECK(_tuple_ptrs != nullptr); + } else { + _tuple_ptrs = (Tuple**)_tuple_data_pool.allocate(_tuple_ptrs_size); + } + + char* tuple_data = nullptr; + if (input_batch.is_compressed) { + // Decompress tuple data into data pool + const char* compressed_data = input_batch.tuple_data.c_str(); + size_t compressed_size = input_batch.tuple_data.size(); + size_t uncompressed_size = 0; + bool success = + snappy::GetUncompressedLength(compressed_data, compressed_size, &uncompressed_size); + DCHECK(success) << "snappy::GetUncompressedLength failed"; + tuple_data = (char*)_tuple_data_pool.allocate(uncompressed_size); + success = snappy::RawUncompress(compressed_data, compressed_size, tuple_data); + DCHECK(success) << "snappy::RawUncompress failed"; + } else { + // Tuple data uncompressed, copy directly into data pool + tuple_data = (char*)_tuple_data_pool.allocate(input_batch.tuple_data.size()); + 
memcpy(tuple_data, input_batch.tuple_data.c_str(), input_batch.tuple_data.size()); + } + + // convert input_batch.tuple_offsets into pointers + int tuple_idx = 0; + for (auto offset : input_batch.tuple_offsets) { + if (offset == -1) { + _tuple_ptrs[tuple_idx++] = nullptr; + } else { + _tuple_ptrs[tuple_idx++] = convert_to(tuple_data + offset); + } + } + + // Check whether we have slots that require offset-to-pointer conversion. + if (!_row_desc.has_varlen_slots()) { + return; + } + + const auto& tuple_descs = _row_desc.tuple_descriptors(); + + // For every unique tuple, convert string offsets contained in tuple data into + // pointers. Tuples were serialized in the order we are deserializing them in, + // so the first occurrence of a tuple will always have a higher offset than any tuple + // we already converted. + for (int i = 0; i < _num_rows; ++i) { + TupleRow* row = get_row(i); + for (size_t j = 0; j < tuple_descs.size(); ++j) { + auto desc = tuple_descs[j]; + if (desc->string_slots().empty() && desc->collection_slots().empty()) { + continue; + } + + Tuple* tuple = row->get_tuple(j); + if (tuple == nullptr) { + continue; + } + + for (auto slot : desc->string_slots()) { + DCHECK(slot->type().is_string_type()); + StringValue* string_val = tuple->get_string_slot(slot->tuple_offset()); + + int offset = convert_to(string_val->ptr); + string_val->ptr = tuple_data + offset; + + // Why we do this mask? Field len of StringValue is changed from int to size_t in + // Doris 0.11. When upgrading, some bits of len sent from 0.10 is random value, + // this works fine in version 0.10, however in 0.11 this will lead to an invalid + // length. So we make the high bits zero here. 
+ string_val->len &= 0x7FFFFFFFL; + } + + // copy collection slot + for (auto slot_collection : desc->collection_slots()) { + DCHECK(slot_collection->type().is_collection_type()); + CollectionValue* array_val = + tuple->get_collection_slot(slot_collection->tuple_offset()); + + int offset = convert_to(array_val->data()); + array_val->set_data(tuple_data + offset); + int null_offset = convert_to(array_val->null_signs()); + array_val->set_null_signs(convert_to(tuple_data + null_offset)); + + const TypeDescriptor& item_type = slot_collection->type().children.at(0); + if (!item_type.is_string_type()) { + continue; + } + + // copy string item + for (size_t k = 0; k < array_val->length(); ++k) { + if (array_val->is_null_at(k)) { + continue; + } + + StringValue* dst_item_v = convert_to( + (uint8_t*)array_val->data() + k * item_type.get_slot_size()); + + if (dst_item_v->len != 0) { + int offset = convert_to(dst_item_v->ptr); + dst_item_v->ptr = tuple_data + offset; + } + } + } + } + } +} + void RowBatch::clear() { if (_cleared) { return; @@ -237,7 +369,7 @@ void RowBatch::clear() { if (config::enable_partitioned_aggregation) { DCHECK(_tuple_ptrs != nullptr); free(_tuple_ptrs); - _mem_tracker->Release(_tuple_ptrs_size); + _mem_tracker->release(_tuple_ptrs_size); _tuple_ptrs = nullptr; } _cleared = true; diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index b5302aeaceb89e..42ec2e43561587 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -150,7 +150,7 @@ Status RuntimeFilterMergeControllerEntity::_init_with_desc( cntVal->runtime_filter_desc = *runtime_filter_desc; cntVal->target_info = *target_info; cntVal->pool.reset(new ObjectPool()); - cntVal->tracker = MemTracker::CreateTracker(); + cntVal->tracker = MemTracker::create_tracker(); cntVal->filter = cntVal->pool->add( new IRuntimeFilter(nullptr, cntVal->tracker.get(), cntVal->pool.get())); diff --git a/be/src/runtime/runtime_state.cpp 
b/be/src/runtime/runtime_state.cpp index bda83e7c770182..c09a138fb5e20a 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -36,6 +36,7 @@ #include "runtime/initial_reservations.h" #include "runtime/load_path_mgr.h" #include "runtime/mem_tracker.h" +#include "runtime/mem_tracker_task_pool.h" #include "runtime/runtime_filter_mgr.h" #include "runtime/thread_context.h" #include "util/cpu_info.h" @@ -207,32 +208,24 @@ Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOpt Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { bool has_query_mem_tracker = _query_options.__isset.mem_limit && (_query_options.mem_limit > 0); int64_t bytes_limit = has_query_mem_tracker ? _query_options.mem_limit : -1; - // we do not use global query-map for now, to avoid mem-exceeded different fragments - // running on the same machine. - // TODO(lingbin): open it later. note that open with BufferedBlockMgr's BlockMgrsMap - // at the same time. 
- - // _query_mem_tracker = MemTracker::get_query_mem_tracker( - // query_id, bytes_limit, _exec_env->process_mem_tracker()); - auto mem_tracker_counter = ADD_COUNTER(&_profile, "MemoryLimit", TUnit::BYTES); mem_tracker_counter->set(bytes_limit); _query_mem_tracker = - MemTracker::CreateTracker(bytes_limit, "RuntimeState:query:" + print_id(query_id), - _exec_env->process_mem_tracker(), true, false); + MemTracker::create_tracker(bytes_limit, "RuntimeState:query:" + print_id(query_id), + _exec_env->process_mem_tracker(), MemTrackerLevel::INSTANCE); #ifdef BE_TEST - if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { - _hook_query_mem_tracker = - _exec_env->query_mem_tracker_registry()->register_query_mem_tracker( + if (ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) { + _new_query_mem_tracker = + _exec_env->task_pool_mem_tracker_registry()->register_query_mem_tracker( print_id(query_id), bytes_limit); } #else - _hook_query_mem_tracker = _exec_env->query_mem_tracker_registry()->register_query_mem_tracker( + _new_query_mem_tracker = _exec_env->task_pool_mem_tracker_registry()->register_query_mem_tracker( print_id(query_id), bytes_limit); #endif - _instance_mem_tracker = - MemTracker::CreateTracker(&_profile, -1, "RuntimeState:instance:", _query_mem_tracker); + _instance_mem_tracker = MemTracker::create_tracker( + -1, "RuntimeState:instance:", _query_mem_tracker, MemTrackerLevel::INSTANCE, &_profile); /* // TODO: this is a stopgap until we implement ExprContext @@ -262,13 +255,13 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { } Status RuntimeState::init_instance_mem_tracker() { - _instance_mem_tracker = MemTracker::CreateTracker(-1, "RuntimeState"); + _instance_mem_tracker = MemTracker::create_tracker(-1, "RuntimeState"); return Status::OK(); } Status RuntimeState::init_buffer_poolstate() { ExecEnv* exec_env = ExecEnv::GetInstance(); - int64_t mem_limit = _query_mem_tracker->GetLowestLimit(MemLimit::HARD); 
+ int64_t mem_limit = _query_mem_tracker->get_lowest_limit(); int64_t max_reservation; if (query_options().__isset.buffer_pool_limit && query_options().buffer_pool_limit > 0) { max_reservation = query_options().buffer_pool_limit; @@ -367,10 +360,10 @@ Status RuntimeState::set_mem_limit_exceeded(MemTracker* tracker, int64_t failed_ << " without exceeding limit." << std::endl; } - // if (_exec_env->process_mem_tracker()->LimitExceeded()) { - // ss << _exec_env->process_mem_tracker()->LogUsage(); + // if (_exec_env->process_mem_tracker()->limit_exceeded()) { + // ss << _exec_env->process_mem_tracker()->log_usage(); // } else { - // ss << _query_mem_tracker->LogUsage(); + // ss << _query_mem_tracker->log_usage(); // } // log_error(ErrorMsg(TErrorCode::GENERAL, ss.str())); log_error(ss.str()); diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 62483aa0b84755..49248b3eb01784 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -407,7 +407,7 @@ class RuntimeState { // The query mem tracker must be released after the _instance_mem_tracker. std::shared_ptr _query_mem_tracker; // TODO(zxy): Will replace _query_mem_tracker in future. 
- std::shared_ptr _hook_query_mem_tracker; + std::shared_ptr _new_query_mem_tracker; // Memory usage of this fragment instance std::shared_ptr _instance_mem_tracker; diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index 75dc64fffdd78c..bddd9fa281b9b3 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp @@ -35,7 +35,7 @@ TabletsChannel::TabletsChannel(const TabletsChannelKey& key, const std::shared_ptr& mem_tracker, bool is_high_priority) : _key(key), _state(kInitialized), _closed_senders(64), _is_high_priority(is_high_priority) { - _mem_tracker = MemTracker::CreateTracker(-1, "TabletsChannel", mem_tracker); + _mem_tracker = MemTracker::create_tracker(-1, "TabletsChannel", mem_tracker); static std::once_flag once_flag; std::call_once(once_flag, [] { REGISTER_HOOK_METRIC(tablet_writer_count, [&]() { return _s_tablet_writer_count.load(); }); diff --git a/be/src/runtime/tcmalloc_hook.h b/be/src/runtime/tcmalloc_hook.h index 5d8f80df1d45f7..3b8bc2d53c8f91 100644 --- a/be/src/runtime/tcmalloc_hook.h +++ b/be/src/runtime/tcmalloc_hook.h @@ -29,12 +29,15 @@ void delete_hook(const void* ptr) { doris::thread_local_ctx.release_mem(tc_malloc_size(const_cast(ptr))); } -// Notice: modify the command in New/Delete Hook should be careful enough, +// Notice: modify the command in New/Delete Hook should be careful enough!!!, // and should be as simple as possible, otherwise it may cause weird errors. E.g: // 1. The first New Hook call of the process may be before some variables of // the process are initialized. // 2. Allocating memory in the Hook command causes the Hook to be entered again, // infinite recursion. +// 3. TCMalloc hook will be triggered during the process of initializing/Destructor +// memtracker shared_ptr, Using the object pointed to by this memtracker shared_ptr +// in TCMalloc hook may cause crash. 
void init_hook() { MallocHook::AddNewHook(&new_hook); MallocHook::AddDeleteHook(&delete_hook); diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 85a9e0cc05b8ec..5b78597776da3c 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -20,11 +20,19 @@ #include #include -#include "gen_cpp/Types_types.h" -#include "runtime/thread_mem_tracker.h" +#include "runtime/thread_mem_tracker_mgr.h" + +#define SCOPED_ATTACH_TASK_THREAD(type, task_id, fragment_instance_id) \ + auto VARNAME_LINENUM(attach_task_thread) = AttachTaskThread(type, task_id, fragment_instance_id) +#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(mem_tracker) \ + auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker) // type, +#define SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER() \ + auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker() namespace doris { +class TUniqueId; + // The thread context saves some info about a working thread. // 2 requried info: // 1. thread_id: Current thread id, Auto generated. 
@@ -45,7 +53,7 @@ class ThreadContext { public: ThreadContext() : _thread_id(std::this_thread::get_id()), _type(TaskType::UNKNOWN) { - _thread_mem_tracker.reset(new ThreadMemTracker()); + _thread_mem_tracker_mgr.reset(new ThreadMemTrackerMgr()); } ~ThreadContext() {} @@ -55,17 +63,15 @@ class ThreadContext { _task_id = task_id; if (type == TaskType::QUERY) { _fragment_instance_id = fragment_instance_id; - _thread_mem_tracker->attach_query(task_id, fragment_instance_id); + _thread_mem_tracker_mgr->attach_query(task_id, fragment_instance_id); } } void detach() { - if (_type == TaskType::QUERY) { - _thread_mem_tracker->detach_query(); - } _type = TaskType::UNKNOWN; _task_id = ""; _fragment_instance_id = TUniqueId(); + _thread_mem_tracker_mgr->detach(); } const std::string type() const; @@ -74,16 +80,31 @@ class ThreadContext { const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } void consume_mem(int64_t size) { - if (_thread_mem_tracker != nullptr) { - _thread_mem_tracker->try_consume(size); + if (_thread_mem_tracker_mgr != nullptr) { + _thread_mem_tracker_mgr->cache_consume(size); } } + void release_mem(int64_t size) { - if (_thread_mem_tracker != nullptr) { - _thread_mem_tracker->try_consume(-size); + if (_thread_mem_tracker_mgr != nullptr) { + _thread_mem_tracker_mgr->cache_consume(-size); } } - void stop_mem_tracker() { _thread_mem_tracker->stop_mem_tracker(); } + + std::shared_ptr thread_mem_tracker() { + return _thread_mem_tracker_mgr->mem_tracker().lock(); + } + std::weak_ptr update_mem_tracker(std::weak_ptr mem_tracker) { + return _thread_mem_tracker_mgr->update_tracker(mem_tracker); + } + void transfer_to_external_tracker(std::shared_ptr dst_tracker, int64_t size) { + _thread_mem_tracker_mgr->transfer_to(dst_tracker, size); + } + void transfer_in_thread_tracker(std::shared_ptr source_tracker, int64_t size) { + _thread_mem_tracker_mgr->transfer_in(source_tracker, size); + } + void start_mem_tracker() { 
_thread_mem_tracker_mgr->start_mem_tracker(); } + void stop_mem_tracker() { _thread_mem_tracker_mgr->stop_mem_tracker(); } private: std::thread::id _thread_id; @@ -91,9 +112,13 @@ class ThreadContext { std::string _task_id; TUniqueId _fragment_instance_id; - // After _thread_mem_tracker is initialized, - // the current thread TCMalloc Hook starts to consume/release mem_tracker - std::unique_ptr _thread_mem_tracker; + // After _thread_mem_tracker_mgr is initialized, the current thread TCMalloc Hook starts to + // consume/release mem_tracker. + // Note that the use of shared_ptr will cause a crash. The guess is that there is an + // intermediate state during the copy construction of shared_ptr. Shared_ptr is not equal + // to nullptr, but the object it points to is not initialized. At this time, when the memory + // is released somewhere, the TCMalloc hook is triggered to cause the crash. + std::unique_ptr _thread_mem_tracker_mgr; }; inline thread_local ThreadContext thread_local_ctx; @@ -115,4 +140,33 @@ inline const std::string ThreadContext::type() const { return task_type_string(_type); } +class AttachTaskThread { +public: + explicit AttachTaskThread(const ThreadContext::TaskType& type, const std::string& task_id, + const TUniqueId& fragment_instance_id = TUniqueId()) { + thread_local_ctx.attach(type, task_id, fragment_instance_id); + } + + ~AttachTaskThread() { thread_local_ctx.detach(); } +}; + +class SwitchThreadMemTracker { +public: + explicit SwitchThreadMemTracker(std::shared_ptr new_mem_tracker) { + _old_mem_tracker = thread_local_ctx.update_mem_tracker(new_mem_tracker); + } + + ~SwitchThreadMemTracker() { thread_local_ctx.update_mem_tracker(_old_mem_tracker); } + +private: + std::weak_ptr _old_mem_tracker; +}; + +class StopThreadMemTracker { +public: + explicit StopThreadMemTracker() { thread_local_ctx.stop_mem_tracker(); } + + ~StopThreadMemTracker() { thread_local_ctx.start_mem_tracker(); } +}; + } // namespace doris diff --git 
a/be/src/runtime/thread_mem_tracker.cpp b/be/src/runtime/thread_mem_tracker.cpp deleted file mode 100644 index 220a504c087a76..00000000000000 --- a/be/src/runtime/thread_mem_tracker.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "runtime/thread_mem_tracker.h" - -#include "service/backend_options.h" - -namespace doris { - -void ThreadMemTracker::attach_query(const std::string& query_id, - const TUniqueId& fragment_instance_id) { -#ifdef BE_TEST - if (ExecEnv::GetInstance()->query_mem_tracker_registry() == nullptr) { - return; - } -#endif - update_query_mem_tracker( - ExecEnv::GetInstance()->query_mem_tracker_registry()->get_query_mem_tracker(query_id)); - _query_id = query_id; - _fragment_instance_id = fragment_instance_id; -} - -void ThreadMemTracker::detach_query() { - update_query_mem_tracker(std::weak_ptr()); - _query_id = ""; - _fragment_instance_id = TUniqueId(); -} - -void ThreadMemTracker::update_query_mem_tracker(std::weak_ptr mem_tracker) { - if (_untracked_mem != 0) { - consume(); - _untracked_mem = 0; - } - _query_mem_tracker = mem_tracker; -} - -void ThreadMemTracker::query_mem_limit_exceeded(int64_t mem_usage) { - if (_fragment_instance_id != TUniqueId() && ExecEnv::GetInstance()->initialized() && - ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { - std::string detail = "Query Memory exceed limit in TCMalloc Hook New."; - auto st = _query_mem_tracker.lock()->MemLimitExceeded(nullptr, detail, mem_usage); - - detail += - " Query Memory exceed limit in TCMalloc Hook New, Backend: {}, Query: {}, " - "Fragment: {}, Used: {}, Limit: {}. 
You can change the limit by session variable " - "exec_mem_limit."; - fmt::format(detail, BackendOptions::get_localhost(), _query_id, - print_id(_fragment_instance_id), - std::to_string(_query_mem_tracker.lock()->consumption()), - std::to_string(_query_mem_tracker.lock()->limit())); - ExecEnv::GetInstance()->fragment_mgr()->cancel( - _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED, detail); - _fragment_instance_id = TUniqueId(); // Make sure it will only be canceled once - } -} - -void ThreadMemTracker::global_mem_limit_exceeded(int64_t mem_usage) { - if (time(nullptr) - global_exceeded_interval > 60) { - std::string detail = "Global Memory exceed limit in TCMalloc Hook New."; - auto st = _global_hook_tracker->MemLimitExceeded(nullptr, detail, mem_usage); - global_exceeded_interval = time(nullptr); - } -} - -void ThreadMemTracker::consume() { - // Query_mem_tracker and global_hook_tracker are counted separately, - // in order to ensure that the process memory counted by global_hook_tracker is accurate enough. - // - // Otherwise, if query_mem_tracker is the child of global_hook_tracker and global_hook_tracker - // is the default tracker, it may be the same block of memory. Consume is called in query_mem_tracker, - // and release is called in global_hook_tracker, which is repeatedly released after ~query_mem_tracker. 
- if (!_query_mem_tracker.expired()) { - if (_stop_query_mem_tracker == false) { - _stop_query_mem_tracker = true; - if (!_query_mem_tracker.lock()->TryConsume(_untracked_mem)) { - query_mem_limit_exceeded(_untracked_mem); - } - _stop_query_mem_tracker = false; - } - } - - // The first time GetGlobalHookTracker is called after the main thread starts, == nullptr - if (_global_hook_tracker != nullptr) { - if (_stop_global_mem_tracker == false) { - _stop_global_mem_tracker = true; - if (!_global_hook_tracker->TryConsume(_untracked_mem)) { - // Currently, _global_hook_tracker is only used for real-time observation to verify - // the accuracy of MemTracker statistics. Therefore, when the _global_hook_tracker - // TryConsume fails, the process is not expected to terminate. To ensure the accuracy - // of real-time statistics, continue to complete the Consume. - _global_hook_tracker->Consume(_untracked_mem); - global_mem_limit_exceeded(_untracked_mem); - } - _stop_global_mem_tracker = false; - } - } -} - -void ThreadMemTracker::try_consume(int64_t size) { - if (_stop_mem_tracker == true) { - return; - } - _untracked_mem += size; - // When some threads `0 < _untracked_mem < _tracker_consume_cache_size` - // and some threads `_untracked_mem <= -_tracker_consume_cache_size` trigger consumption(), - // it will cause tracker->consumption to be temporarily less than 0. - if (_untracked_mem >= _tracker_consume_cache_size || - _untracked_mem <= -_tracker_consume_cache_size) { - consume(); - _untracked_mem = 0; - } -} - -} // namespace doris diff --git a/be/src/runtime/thread_mem_tracker_mgr.cpp b/be/src/runtime/thread_mem_tracker_mgr.cpp new file mode 100644 index 00000000000000..1fc6368875360c --- /dev/null +++ b/be/src/runtime/thread_mem_tracker_mgr.cpp @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/thread_mem_tracker_mgr.h" + +#include "runtime/mem_tracker_task_pool.h" +#include "service/backend_options.h" + +namespace doris { + +std::shared_ptr ThreadMemTrackerMgr::default_mem_tracker() { + ExecEnv* exec_env = ExecEnv::GetInstance(); + std::shared_ptr process_tracker = exec_env->new_process_mem_tracker(); + if (process_tracker != nullptr) { + return process_tracker; + } else { + return MemTracker::get_root_tracker(); + } +} + +void ThreadMemTrackerMgr::attach_query(const std::string& query_id, + const TUniqueId& fragment_instance_id) { + DCHECK(query_id != "" && fragment_instance_id != TUniqueId()); + _query_id = query_id; + _fragment_instance_id = fragment_instance_id; +#ifdef BE_TEST + if (ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) { + return; + } +#endif + update_tracker(ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_query_mem_tracker( + query_id)); +} + +void ThreadMemTrackerMgr::detach() { + update_tracker(default_mem_tracker()); + _query_id = ""; + _fragment_instance_id = TUniqueId(); +} + +std::weak_ptr ThreadMemTrackerMgr::update_tracker( + std::weak_ptr mem_tracker) { + if (_untracked_mem != 0) { + noncache_consume(); + _untracked_mem = 0; + } + DCHECK(!_mem_tracker.expired()); + 
DCHECK(!mem_tracker.expired()); + std::weak_ptr old_mem_tracker = _mem_tracker.lock(); + _mem_tracker = mem_tracker; + return old_mem_tracker; +} + +void ThreadMemTrackerMgr::exceeded_cancel_query(std::shared_ptr query_mem_tracker) { + if (_fragment_instance_id != TUniqueId() && ExecEnv::GetInstance()->initialized() && + ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { + std::string detail = + " Query Memory exceed limit in TCMalloc Hook New, Backend: {}, Query: {}, " + "Fragment: {}, Used: {}, Limit: {}. You can change the limit by session variable " + "exec_mem_limit."; + fmt::format(detail, BackendOptions::get_localhost(), _query_id, + print_id(_fragment_instance_id), + std::to_string(query_mem_tracker->consumption()), + std::to_string(query_mem_tracker->limit())); + ExecEnv::GetInstance()->fragment_mgr()->cancel( + _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED, detail); + _fragment_instance_id = TUniqueId(); // Make sure it will only be canceled once + } +} + +void ThreadMemTrackerMgr::exceeded(Status st, int64_t mem_usage) { + DCHECK(st.is_mem_limit_exceeded()); + std::string detail = st.to_string() + ", in TCMalloc Hook New."; + auto rst = _mem_tracker.lock()->mem_limit_exceeded(nullptr, detail, mem_usage); + if (_query_id != "") { + std::shared_ptr query_mem_tracker = + ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_query_mem_tracker( + _query_id); + DCHECK(query_mem_tracker->limit_exceeded()); + exceeded_cancel_query(query_mem_tracker); + } + LOG(WARNING) << rst.to_string(); +} + +void ThreadMemTrackerMgr::noncache_consume() { + // Ensure thread safety + auto tracker = _mem_tracker.lock(); + // The first time get_root_tracker is called after the main thread starts, == nullptr. 
+ if (tracker) { + _stop_mem_tracker = true; + Status st = _mem_tracker.lock()->try_consume(_untracked_mem); + if (!st) { + // The memory has been allocated, so when TryConsume fails, need to continue to complete + // the consume to ensure the accuracy of the statistics. + _mem_tracker.lock()->consume(_untracked_mem); + exceeded(st, _untracked_mem); + } + _stop_mem_tracker = false; + } +} + +void ThreadMemTrackerMgr::cache_consume(int64_t size) { + if (_stop_mem_tracker == true) { + return; + } + _untracked_mem += size; + // When some threads `0 < _untracked_mem < _tracker_consume_cache_size` + // and some threads `_untracked_mem <= -_tracker_consume_cache_size` trigger consumption(), + // it will cause tracker->consumption to be temporarily less than 0. + if (_untracked_mem >= _tracker_consume_cache_size || + _untracked_mem <= -_tracker_consume_cache_size) { + noncache_consume(); + _untracked_mem = 0; + } +} + +} // namespace doris diff --git a/be/src/runtime/thread_mem_tracker.h b/be/src/runtime/thread_mem_tracker_mgr.h similarity index 54% rename from be/src/runtime/thread_mem_tracker.h rename to be/src/runtime/thread_mem_tracker_mgr.h index 521aa56d65fd8d..874fe6c1cc958a 100644 --- a/be/src/runtime/thread_mem_tracker.h +++ b/be/src/runtime/thread_mem_tracker_mgr.h @@ -25,39 +25,61 @@ namespace doris { -// TCMalloc new/delete Hook is counted in the memory_tracker of the current thread -class ThreadMemTracker { +// TCMalloc new/delete Hook is counted in the memory_tracker of the current thread. +// +// In the original design, the MemTracker consume method is called before the memory is allocated. +// If the consume succeeds, the memory is actually allocated, otherwise an exception is thrown. +// But the statistics of memory through TCMalloc new/delete Hook are after the memory is actually allocated, +// which is different from the previous behavior. 
Therefore, when alloc for some large memory, +need to manually call consume after stop_mem_tracker, and then start_mem_tracker. +class ThreadMemTrackerMgr { public: - ThreadMemTracker() : _global_hook_tracker(MemTracker::GetGlobalHookTracker()) {} - ~ThreadMemTracker() { detach_query(); } + ThreadMemTrackerMgr() : _mem_tracker(default_mem_tracker()) {} + ~ThreadMemTrackerMgr() { detach(); } + + std::shared_ptr default_mem_tracker(); // After attach, the current thread TCMalloc Hook starts to consume/release query mem_tracker void attach_query(const std::string& query_id, const TUniqueId& fragment_instance_id); - void detach_query(); - - void update_query_mem_tracker(std::weak_ptr mem_tracker); - - void try_consume(int64_t size); - - void stop_mem_tracker() { _stop_mem_tracker = true; } - -private: - void query_mem_limit_exceeded(int64_t mem_usage); + void detach(); - void global_mem_limit_exceeded(int64_t mem_usage); + std::weak_ptr update_tracker(std::weak_ptr mem_tracker); // Note that, If call the memory allocation operation in TCMalloc new/delete Hook, // such as calling LOG/iostream/sstream/stringstream/etc.
related methods, // must increase the control to avoid entering infinite recursion, otherwise it may cause crash or stuck, - void consume(); + void cache_consume(int64_t size); + + void noncache_consume(); + + void transfer_to(std::shared_ptr dst_tracker, int64_t size) { + DCHECK(!_mem_tracker.expired()); + if (dst_tracker != nullptr) { + _mem_tracker.lock()->release(size); + dst_tracker->consume(size); + } + } + + void transfer_in(std::shared_ptr source_tracker, int64_t size) { + DCHECK(!_mem_tracker.expired()); + if (source_tracker != nullptr) { + source_tracker->release(size); + _mem_tracker.lock()->consume(size); + } + } + + std::weak_ptr mem_tracker() { return _mem_tracker; } + void stop_mem_tracker() { _stop_mem_tracker = true; } + void start_mem_tracker() { _stop_mem_tracker = false; } private: - std::string _query_id; - TUniqueId _fragment_instance_id; + void exceeded_cancel_query(std::shared_ptr query_mem_tracker); + + void exceeded(Status st, int64_t mem_usage); - std::weak_ptr _query_mem_tracker; - std::shared_ptr _global_hook_tracker = nullptr; +private: + std::weak_ptr _mem_tracker; // Consume size smaller than _tracker_consume_cache_size will continue to accumulate // to avoid frequent calls to consume/release of MemTracker. @@ -67,16 +89,12 @@ class ThreadMemTracker { // If there is a memory new/delete operation in the consume method, it may enter infinite recursion. // Note: After the tracker is stopped, the memory alloc in the consume method should be released in time, // otherwise the MemTracker statistics will be inaccurate. - bool _stop_query_mem_tracker = false; - bool _stop_global_mem_tracker = false; - - // In some cases, we want to turn off memory statistics. - // For example, when ~GlobalHookTracker, TCMalloc delete hook - // release GlobalHookTracker will crash. + // In some cases, we want to turn off thread automatic memory statistics, manually call consume. 
+ // In addition, when ~RootTracker, TCMalloc delete hook release RootTracker will crash. bool _stop_mem_tracker = false; - // Control the interval of printing Log. - int64_t global_exceeded_interval = 0; + std::string _query_id; + TUniqueId _fragment_instance_id; }; } // namespace doris diff --git a/be/src/runtime/vectorized_row_batch.cpp b/be/src/runtime/vectorized_row_batch.cpp index 1fcdcd93582fc8..f26822833ce96c 100644 --- a/be/src/runtime/vectorized_row_batch.cpp +++ b/be/src/runtime/vectorized_row_batch.cpp @@ -29,7 +29,7 @@ VectorizedRowBatch::VectorizedRowBatch(const TabletSchema* schema, _selected_in_use = false; _size = 0; - _tracker = MemTracker::CreateTracker(-1, "VectorizedRowBatch", parent_tracker); + _tracker = MemTracker::create_tracker(-1, "VectorizedRowBatch", parent_tracker); _mem_pool.reset(new MemPool(_tracker.get())); _selected = reinterpret_cast(new char[sizeof(uint16_t) * _capacity]); diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index ccfe38200f149a..a5f27b76930d6c 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -133,7 +133,7 @@ int main(int argc, char** argv) { return -1; } - if (doris::config::tc_init_hook) { + if (doris::config::use_tc_hook) { init_hook(); } @@ -290,7 +290,7 @@ int main(int argc, char** argv) { #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) doris::MemInfo::refresh_current_mem(); #endif - doris::ExecEnv::GetInstance()->query_mem_tracker_registry()->deregister_query_mem_tracker(); + doris::ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->logout_query_mem_tracker(); sleep(10); } diff --git a/be/test/exec/hash_table_test.cpp b/be/test/exec/hash_table_test.cpp index 3502d0b9d104bf..52e1b1f08ab929 100644 --- a/be/test/exec/hash_table_test.cpp +++ b/be/test/exec/hash_table_test.cpp @@ -47,8 +47,8 @@ namespace doris { class HashTableTest : public testing::Test { public: HashTableTest() { - _tracker = 
MemTracker::CreateTracker(-1, "root"); - _pool_tracker = MemTracker::CreateTracker(-1, "mem-pool", _tracker); + _tracker = MemTracker::create_tracker(-1, "root"); + _pool_tracker = MemTracker::create_tracker(-1, "mem-pool", _tracker); _mem_pool.reset(new MemPool(_pool_tracker.get())); _state = _pool.add(new RuntimeState(TQueryGlobals())); _state->init_instance_mem_tracker(); @@ -196,7 +196,7 @@ TEST_F(HashTableTest, SetupTest) { // The hash table is rehashed a few times and the scans/finds are tested again. TEST_F(HashTableTest, BasicTest) { std::shared_ptr hash_table_tracker = - MemTracker::CreateTracker(-1, "hash-table-basic-tracker", _tracker); + MemTracker::create_tracker(-1, "hash-table-basic-tracker", _tracker); TupleRow* build_rows[5]; TupleRow* scan_rows[5] = {0}; @@ -260,7 +260,7 @@ TEST_F(HashTableTest, BasicTest) { // This tests makes sure we can scan ranges of buckets TEST_F(HashTableTest, ScanTest) { std::shared_ptr hash_table_tracker = - MemTracker::CreateTracker(-1, "hash-table-scan-tracker", _tracker); + MemTracker::create_tracker(-1, "hash-table-scan-tracker", _tracker); std::vector is_null_safe = {false}; int initial_seed = 1; @@ -314,7 +314,7 @@ TEST_F(HashTableTest, GrowTableTest) { int expected_size = 0; std::shared_ptr mem_tracker = - MemTracker::CreateTracker(1024 * 1024, "hash-table-grow-tracker", _tracker); + MemTracker::create_tracker(1024 * 1024, "hash-table-grow-tracker", _tracker); std::vector is_null_safe = {false}; int initial_seed = 1; int64_t num_buckets = 4; @@ -357,7 +357,7 @@ TEST_F(HashTableTest, GrowTableTest2) { int expected_size = 0; std::shared_ptr mem_tracker = - MemTracker::CreateTracker(1024 * 1024 * 1024, "hash-table-grow2-tracker", _tracker); + MemTracker::create_tracker(1024 * 1024 * 1024, "hash-table-grow2-tracker", _tracker); std::vector is_null_safe = {false}; int initial_seed = 1; int64_t num_buckets = 4; diff --git a/be/test/exec/tablet_sink_test.cpp b/be/test/exec/tablet_sink_test.cpp index 
405e7c0d8293f1..e59e972097ff78 100644 --- a/be/test/exec/tablet_sink_test.cpp +++ b/be/test/exec/tablet_sink_test.cpp @@ -57,7 +57,7 @@ class OlapTableSinkTest : public testing::Test { _env->_internal_client_cache = new BrpcClientCache(); _env->_function_client_cache = new BrpcClientCache(); _env->_buffer_reservation = new ReservationTracker(); - _env->_query_mem_tracker_registry.reset(new QueryMemTrackerRegistry()); + _env->_task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); ThreadPoolBuilder("SendBatchThreadPool") .set_min_threads(1) .set_max_threads(5) diff --git a/be/test/exprs/bloom_filter_predicate_test.cpp b/be/test/exprs/bloom_filter_predicate_test.cpp index ca6e5a9f0d4795..1cba866e7d1fe8 100644 --- a/be/test/exprs/bloom_filter_predicate_test.cpp +++ b/be/test/exprs/bloom_filter_predicate_test.cpp @@ -31,7 +31,7 @@ class BloomFilterPredicateTest : public testing::Test { }; TEST_F(BloomFilterPredicateTest, bloom_filter_func_int_test) { - auto tracker = MemTracker::CreateTracker(); + auto tracker = MemTracker::create_tracker(); std::unique_ptr func( create_bloom_filter(tracker.get(), PrimitiveType::TYPE_INT)); ASSERT_TRUE(func->init(1024, 0.05).ok()); @@ -53,7 +53,7 @@ TEST_F(BloomFilterPredicateTest, bloom_filter_func_int_test) { } TEST_F(BloomFilterPredicateTest, bloom_filter_func_stringval_test) { - auto tracker = MemTracker::CreateTracker(); + auto tracker = MemTracker::create_tracker(); std::unique_ptr func( create_bloom_filter(tracker.get(), PrimitiveType::TYPE_VARCHAR)); ASSERT_TRUE(func->init(1024, 0.05).ok()); @@ -104,7 +104,7 @@ TEST_F(BloomFilterPredicateTest, bloom_filter_func_stringval_test) { } TEST_F(BloomFilterPredicateTest, bloom_filter_size_test) { - auto tracker = MemTracker::CreateTracker(); + auto tracker = MemTracker::create_tracker(); std::unique_ptr func( create_bloom_filter(tracker.get(), PrimitiveType::TYPE_VARCHAR)); int length = 4096; diff --git a/be/test/olap/bloom_filter_column_predicate_test.cpp 
b/be/test/olap/bloom_filter_column_predicate_test.cpp index 24abea12151ea9..7921fc88de9dda 100644 --- a/be/test/olap/bloom_filter_column_predicate_test.cpp +++ b/be/test/olap/bloom_filter_column_predicate_test.cpp @@ -95,7 +95,7 @@ TEST_F(TestBloomFilterColumnPredicate, FLOAT_COLUMN) { return_columns.push_back(i); } - auto tracker = MemTracker::CreateTracker(-1, "OlapScanner"); + auto tracker = MemTracker::create_tracker(-1, "OlapScanner"); std::shared_ptr bloom_filter( create_bloom_filter(tracker.get(), PrimitiveType::TYPE_FLOAT)); diff --git a/be/test/olap/generic_iterators_test.cpp b/be/test/olap/generic_iterators_test.cpp index b73ad0271e6531..59bff103ba53bc 100644 --- a/be/test/olap/generic_iterators_test.cpp +++ b/be/test/olap/generic_iterators_test.cpp @@ -83,8 +83,8 @@ TEST(GenericIteratorsTest, Union) { inputs.push_back(new_auto_increment_iterator(schema, 200)); inputs.push_back(new_auto_increment_iterator(schema, 300)); - auto iter = new_union_iterator(inputs, - MemTracker::CreateTracker(-1, "UnionIterator", nullptr, false)); + auto iter = new_union_iterator(std::move(inputs), + MemTracker::create_tracker(-1, "UnionIterator", nullptr)); StorageReadOptions opts; auto st = iter->init(opts); ASSERT_TRUE(st.ok()); @@ -124,7 +124,7 @@ TEST(GenericIteratorsTest, Merge) { inputs.push_back(new_auto_increment_iterator(schema, 300)); auto iter = new_merge_iterator( - std::move(inputs), MemTracker::CreateTracker(-1, "MergeIterator", nullptr, false), -1); + std::move(inputs), MemTracker::create_tracker(-1, "MergeIterator", nullptr), -1); StorageReadOptions opts; auto st = iter->init(opts); ASSERT_TRUE(st.ok()); diff --git a/be/test/runtime/mem_limit_test.cpp b/be/test/runtime/mem_limit_test.cpp index b2c4017ea1e47f..378b9c5d083e5b 100644 --- a/be/test/runtime/mem_limit_test.cpp +++ b/be/test/runtime/mem_limit_test.cpp @@ -24,121 +24,121 @@ namespace doris { TEST(MemTrackerTest, SingleTrackerNoLimit) { - auto t = MemTracker::CreateTracker(); + auto t = 
MemTracker::create_tracker(); EXPECT_FALSE(t->has_limit()); - t->Consume(10); + t->consume(10); EXPECT_EQ(t->consumption(), 10); - t->Consume(10); + t->consume(10); EXPECT_EQ(t->consumption(), 20); - t->Release(15); + t->release(15); EXPECT_EQ(t->consumption(), 5); - EXPECT_FALSE(t->LimitExceeded(MemLimit::HARD)); - t->Release(5); + EXPECT_FALSE(t->limit_exceeded()); + t->release(5); } TEST(MemTestTest, SingleTrackerWithLimit) { - auto t = MemTracker::CreateTracker(11, "limit tracker"); + auto t = MemTracker::create_tracker(11, "limit tracker"); EXPECT_TRUE(t->has_limit()); - t->Consume(10); + t->consume(10); EXPECT_EQ(t->consumption(), 10); - EXPECT_FALSE(t->LimitExceeded(MemLimit::HARD)); - t->Consume(10); + EXPECT_FALSE(t->limit_exceeded()); + t->consume(10); EXPECT_EQ(t->consumption(), 20); - EXPECT_TRUE(t->LimitExceeded(MemLimit::HARD)); - t->Release(15); + EXPECT_TRUE(t->limit_exceeded()); + t->release(15); EXPECT_EQ(t->consumption(), 5); - EXPECT_FALSE(t->LimitExceeded(MemLimit::HARD)); - t->Release(5); + EXPECT_FALSE(t->limit_exceeded()); + t->release(5); } TEST(MemTestTest, TrackerHierarchy) { - auto p = MemTracker::CreateTracker(100); - auto c1 = MemTracker::CreateTracker(80, "c1", p); - auto c2 = MemTracker::CreateTracker(50, "c2", p); + auto p = MemTracker::create_tracker(100); + auto c1 = MemTracker::create_tracker(80, "c1", p); + auto c2 = MemTracker::create_tracker(50, "c2", p); // everything below limits - c1->Consume(60); + c1->consume(60); EXPECT_EQ(c1->consumption(), 60); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 0); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_FALSE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 60); - 
EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(p->limit_exceeded()); + EXPECT_FALSE(p->any_limit_exceeded()); // p goes over limit - c2->Consume(50); + c2->consume(50); EXPECT_EQ(c1->consumption(), 60); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_TRUE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_TRUE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 50); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_TRUE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_TRUE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 110); - EXPECT_TRUE(p->LimitExceeded(MemLimit::HARD)); + EXPECT_TRUE(p->limit_exceeded()); // c2 goes over limit, p drops below limit - c1->Release(20); - c2->Consume(10); + c1->release(20); + c2->consume(10); EXPECT_EQ(c1->consumption(), 40); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 60); - EXPECT_TRUE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_TRUE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_TRUE(c2->limit_exceeded()); + EXPECT_TRUE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 100); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); - c1->Release(40); - c2->Release(60); + EXPECT_FALSE(p->limit_exceeded()); + c1->release(40); + c2->release(60); } TEST(MemTestTest, TrackerHierarchyTryConsume) { - auto p = MemTracker::CreateTracker(100); - auto c1 = MemTracker::CreateTracker(80, "c1", p); - auto c2 = MemTracker::CreateTracker(50, "c2", p); + auto p = MemTracker::create_tracker(100); + auto c1 = MemTracker::create_tracker(80, "c1", p); + auto c2 = MemTracker::create_tracker(50, "c2", p); // everything below limits - bool consumption = c1->TryConsume(60).ok(); + bool consumption = 
c1->try_consume(60).ok(); EXPECT_EQ(consumption, true); EXPECT_EQ(c1->consumption(), 60); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 0); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_FALSE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 60); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(p->limit_exceeded()); + EXPECT_FALSE(p->any_limit_exceeded()); // p goes over limit - consumption = c2->TryConsume(50).ok(); + consumption = c2->try_consume(50).ok(); EXPECT_EQ(consumption, false); EXPECT_EQ(c1->consumption(), 60); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 0); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_FALSE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 60); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(p->limit_exceeded()); + EXPECT_FALSE(p->any_limit_exceeded()); // c2 goes over limit, p drops below limit - c1->Release(20); - c2->Consume(10); + c1->release(20); + c2->consume(10); EXPECT_EQ(c1->consumption(), 40); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 10); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD)); + 
EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_FALSE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 50); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(p->limit_exceeded()); - c1->Release(40); - c2->Release(10); + c1->release(40); + c2->release(10); } } // end namespace doris diff --git a/be/test/runtime/memory_scratch_sink_test.cpp b/be/test/runtime/memory_scratch_sink_test.cpp index b2443aced80aa7..e20f1023fce790 100644 --- a/be/test/runtime/memory_scratch_sink_test.cpp +++ b/be/test/runtime/memory_scratch_sink_test.cpp @@ -115,7 +115,7 @@ void MemoryScratchSinkTest::init_runtime_state() { _state = new RuntimeState(query_id, query_options, TQueryGlobals(), _env->exec_env()); _state->init_instance_mem_tracker(); _mem_tracker = - MemTracker::CreateTracker(-1, "MemoryScratchSinkTest", _state->instance_mem_tracker()); + MemTracker::create_tracker(-1, "MemoryScratchSinkTest", _state->instance_mem_tracker()); _state->set_desc_tbl(_desc_tbl); _state->_load_dir = "./test_run/output/"; _state->init_mem_trackers(TUniqueId()); diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc index 82263f2e31ed21..1456cfc6ae2dad 100644 --- a/be/test/runtime/test_env.cc +++ b/be/test/runtime/test_env.cc @@ -31,14 +31,14 @@ namespace doris { TestEnv::TestEnv() - : _block_mgr_parent_tracker(MemTracker::CreateTracker(-1, "BufferedBlockMgr2")), - _io_mgr_tracker(MemTracker::CreateTracker(-1, "DiskIoMgr")) { + : _block_mgr_parent_tracker(MemTracker::create_tracker(-1, "BufferedBlockMgr2")), + _io_mgr_tracker(MemTracker::create_tracker(-1, "DiskIoMgr")) { // Some code will use ExecEnv::GetInstance(), so init the global ExecEnv singleton _exec_env = ExecEnv::GetInstance(); _exec_env->_thread_mgr = new ThreadResourceMgr(2); _exec_env->_buffer_reservation = new ReservationTracker(); - _exec_env->_process_mem_tracker = MemTracker::CreateTracker(-1, "TestEnv"); - _exec_env->_query_mem_tracker_registry.reset(new QueryMemTrackerRegistry()); + 
_exec_env->_process_mem_tracker = MemTracker::create_tracker(-1, "TestEnv"); + _exec_env->_task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10); _exec_env->disk_io_mgr()->init(_io_mgr_tracker); _exec_env->_scan_thread_pool = new PriorityThreadPool(1, 16); diff --git a/be/test/util/arrow/arrow_work_flow_test.cpp b/be/test/util/arrow/arrow_work_flow_test.cpp index 01c11224056936..5a5f2dc36c14e6 100644 --- a/be/test/util/arrow/arrow_work_flow_test.cpp +++ b/be/test/util/arrow/arrow_work_flow_test.cpp @@ -91,7 +91,7 @@ void ArrowWorkFlowTest::init_runtime_state() { _exec_env->_result_queue_mgr = new ResultQueueMgr(); _exec_env->_thread_mgr = new ThreadResourceMgr(); _exec_env->_buffer_reservation = new ReservationTracker(); - _exec_env->_query_mem_tracker_registry.reset(new QueryMemTrackerRegistry()); + _exec_env->_task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); TQueryOptions query_options; query_options.batch_size = 1024; TUniqueId query_id; @@ -100,7 +100,7 @@ void ArrowWorkFlowTest::init_runtime_state() { _state = new RuntimeState(query_id, query_options, TQueryGlobals(), _exec_env); _state->init_instance_mem_tracker(); _mem_tracker = - MemTracker::CreateTracker(-1, "ArrowWorkFlowTest", _state->instance_mem_tracker()); + MemTracker::create_tracker(-1, "ArrowWorkFlowTest", _state->instance_mem_tracker()); _state->set_desc_tbl(_desc_tbl); _state->_load_dir = "./test_run/output/"; _state->init_mem_trackers(TUniqueId()); diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 120338468d4666..e75bf710b16fa9 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1303,12 +1303,6 @@ Available memory, value range: [0-100] If the system is found to be in a high-stress scenario and a large number of threads are found in the tcmalloc lock competition phase through the BE 
thread stack, such as a large number of `SpinLock` related stacks, you can try increasing this parameter to improve system performance. [Reference](https://github.com/gperftools/gperftools/issues/1111) -### `tc_init_hook` - -* Type: bool -* Description: Whether to initialize TCmalloc new/delete Hook, MemTracker is currently counted in Hook. -* Default: true - ### `tc_use_memory_min` Default:10737418240 @@ -1437,22 +1431,36 @@ The size of the buffer before flashing * Default: 3 +### `use_tc_hook` + +* Type: bool +* Description: Whether to initialize TCmalloc new/delete Hook, MemTracker is currently counted in Hook. +* Default: true + ### `mem_tracker_level` * Type: int16 * Description: The level at which MemTracker is displayed on the Web page equal or lower than this level will be displayed on the Web page ``` - RELEASE = 0 - DEBUG = 1 + OVERVIEW = 0 + TASK = 1 + INSTANCE = 2 + VERBOSE = 3 ``` * Default: 0 ### `mem_tracker_consume_min_size_bytes` * Type: int32 -* Description: The minimum length of TCMalloc Hook when consume/release MemTracker. Consume size smaller than this value will continue to accumulate to avoid frequent calls to consume/release of MemTracker. Decreasing this value will increase the frequency of consume/release. Increasing this value will cause MemTracker statistics to be inaccurate. Theoretically, the statistical value of a MemTracker differs from the true value = (mem_tracker_consume_min_size_bytes * the number of BE threads where the MemTracker is located). +* Description: The minimum length of TCMalloc Hook when consume/release MemTracker. Consume size smaller than this value will continue to accumulate to avoid frequent calls to consume/release of MemTracker. Decreasing this value will increase the frequency of consume/release. Increasing this value will cause MemTracker statistics to be inaccurate. 
Theoretically, the statistical value of a MemTracker differs from the true value = ( mem_tracker_consume_min_size_bytes * the number of BE threads where the MemTracker is located). * Default: 1048576 +### `memory_leak_detection` + +* Type: bool +* Description: Whether to start memory leak detection, memory leak occurs when MemTracker is considered to be a negative value, but the actual MemTracker records inaccurately will also cause a negative value, so this feature is in the experimental stage. +* Default: false + ### `max_segment_num_per_rowset` * Type: int32 diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index 8bbfec30be4bcb..a4944a3ad7b091 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1325,12 +1325,6 @@ tablet状态缓存的更新间隔,单位:秒 如果发现系统在高压力场景下,通过 BE 线程堆栈发现大量线程处于 tcmalloc 的锁竞争阶段,如大量的 `SpinLock` 相关堆栈,则可以尝试增大该参数来提升系统性能。[参考](https://github.com/gperftools/gperftools/issues/1111) -### `tc_init_hook` - -* 类型:bool -* 描述:是否初始化TCmalloc new/delete Hook,目前在Hook中统计MemTracker。 -* 默认值:true - ### `tc_use_memory_min` 默认值:10737418240 @@ -1456,13 +1450,21 @@ webserver默认工作线程数 ``` * 默认值: 3 +### `use_tc_hook` + +* 类型:bool +* 描述:是否初始化TCmalloc new/delete Hook,目前在Hook中统计MemTracker。 +* 默认值:true + ### `mem_tracker_level` * 类型: int16 * 描述: MemTracker在Web页面上展示的级别,等于或低于这个级别的MemTracker会在Web页面上展示 ``` - RELEASE = 0 - DEBUG = 1 + OVERVIEW = 0 + TASK = 1 + INSTANCE = 2 + VERBOSE = 3 ``` * 默认值: 0 @@ -1472,6 +1474,12 @@ webserver默认工作线程数 * 描述: TCMalloc Hook consume/release MemTracker时的最小长度,小于该值的consume size会持续累加,避免频繁调用MemTracker的consume/release,减小该值会增加consume/release的频率,增大该值会导致MemTracker统计不准,理论上一个MemTracker的统计值与真实值相差 = (mem_tracker_consume_min_size_bytes * 这个MemTracker所在的BE线程数)。 * 默认值: 1048576 +### `memory_leak_detection` + +* 类型: bool +* 描述: 是否启动内存泄漏检测,认为 MemTracker 为负值时发生内存泄漏,但实际 MemTracker 记录不准确时也会导致负值,所以这个功能处于实验阶段。 +* 默认值: false + ### 
`max_segment_num_per_rowset` * 类型: int32 From 50dceaaa8e195b76eacf5bc887f165c8923ae229 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Wed, 5 Jan 2022 22:17:48 +0800 Subject: [PATCH 12/14] fix bug --- be/src/common/config.h | 3 + be/src/olap/lru_cache.cpp | 28 ++++---- be/src/olap/memtable_flush_executor.cpp | 3 +- be/src/olap/rowset/segment_v2/page_io.cpp | 29 +++----- be/src/runtime/mem_pool.cpp | 49 ++++--------- be/src/runtime/mem_pool.h | 1 - be/src/runtime/mem_tracker.cpp | 40 ++++++++--- be/src/runtime/mem_tracker.h | 18 +++-- be/src/runtime/memory/chunk_allocator.cpp | 43 +++--------- be/src/runtime/memory/chunk_allocator.h | 4 +- be/src/runtime/thread_context.h | 64 +++++++++++++---- be/src/runtime/thread_mem_tracker_mgr.cpp | 69 +++++++++++++------ be/src/runtime/thread_mem_tracker_mgr.h | 35 +++++----- be/test/runtime/test_env.cc | 5 +- .../administrator-guide/config/be_config.md | 2 +- .../administrator-guide/config/be_config.md | 2 +- 16 files changed, 212 insertions(+), 183 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index 563785ec900a13..78fa344e2ad4af 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -613,6 +613,9 @@ CONF_mInt16(mem_tracker_level, "0"); // Increasing this value will cause MemTracker statistics to be inaccurate. CONF_mInt32(mem_tracker_consume_min_size_bytes, "1048576"); +// When MemTracker is a negative value, it is considered that a memory leak has occurred, +// but the actual MemTracker records inaccurately will also cause a negative value, +// so this feature is in the experimental stage. 
CONF_mBool(memory_leak_detection, "false"); // The version information of the tablet will be stored in the memory diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index b93e1190f10b3a..d0ab7da32ee972 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -14,9 +14,9 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/olap_index.h" -#include "runtime/thread_context.h" #include "olap/row_block.h" #include "olap/utils.h" +#include "runtime/thread_context.h" #include "util/doris_metrics.h" using std::string; @@ -293,8 +293,8 @@ void LRUCache::_evict_one_entry(LRUHandle* e) { Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value), - CachePriority priority, std::shared_ptr source_mem_tracker) { - + CachePriority priority, + std::shared_ptr source_mem_tracker) { size_t handle_size = sizeof(LRUHandle) - 1 + key.size(); LRUHandle* e = reinterpret_cast(malloc(handle_size)); e->value = value; @@ -320,8 +320,8 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // note that the cache might get larger than its capacity if not enough // space was freed auto old = _table.insert(e); - // DCHECK(thread_local_ctx.thread_mem_tracker()->GetQueryMemTracker() == nullptr); - thread_local_ctx.transfer_in_thread_tracker(source_mem_tracker, charge); + DCHECK(thread_local_ctx.thread_mem_tracker()->get_task_mem_tracker() == nullptr); + source_mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), charge); _usage += e->total_size; if (old != nullptr) { old->in_cache = false; @@ -471,32 +471,32 @@ ShardedLRUCache::~ShardedLRUCache() { } _entity->deregister_hook(_name); DorisMetrics::instance()->metric_registry()->deregister_entity(_entity); - // _mem_tracker->release(_mem_tracker->consumption()); } Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t charge, void 
(*deleter)(const CacheKey& key, void* value), CachePriority priority) { std::shared_ptr source_mem_tracker = thread_local_ctx.thread_mem_tracker(); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); const uint32_t hash = _hash_slice(key); - return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority, source_mem_tracker); + return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority, + source_mem_tracker); } Cache::Handle* ShardedLRUCache::lookup(const CacheKey& key) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); const uint32_t hash = _hash_slice(key); return _shards[_shard(hash)]->lookup(key, hash); } void ShardedLRUCache::release(Handle* handle) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); LRUHandle* h = reinterpret_cast(handle); _shards[_shard(h->hash)]->release(handle); } void ShardedLRUCache::erase(const CacheKey& key) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); const uint32_t hash = _hash_slice(key); _shards[_shard(hash)]->erase(key, hash); } @@ -515,7 +515,7 @@ uint64_t ShardedLRUCache::new_id() { } int64_t ShardedLRUCache::prune() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); int64_t num_prune = 0; for (int s = 0; s < kNumShards; s++) { num_prune += _shards[s]->prune(); @@ -524,7 +524,7 @@ int64_t ShardedLRUCache::prune() { } int64_t ShardedLRUCache::prune_if(CacheValuePredicate pred) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); int64_t num_prune = 0; for (int s = 0; s < kNumShards; s++) { num_prune += 
_shards[s]->prune_if(pred); @@ -551,8 +551,6 @@ void ShardedLRUCache::update_cache_metrics() const { usage_ratio->set_value(total_capacity == 0 ? 0 : ((double)total_usage / total_capacity)); hit_ratio->set_value(total_lookup_count == 0 ? 0 : ((double)total_hit_count / total_lookup_count)); - - // _mem_tracker->consume(total_usage - _mem_tracker->consumption()); } Cache* new_lru_cache(const std::string& name, size_t capacity, diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index 67392d913463ba..a2251a73afe104 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ b/be/src/olap/memtable_flush_executor.cpp @@ -57,7 +57,8 @@ OLAPStatus FlushToken::wait() { } void FlushToken::_flush_memtable(std::shared_ptr memtable, int64_t submit_task_time) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(memtable->mem_tracker()); + // TODO(zxy) + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(memtable->mem_tracker(), "FlushToken", false); _stats.flush_wait_time_ns += (MonotonicNanos() - submit_task_time); SCOPED_CLEANUP({ memtable.reset(); }); // If previous flush has failed, return directly diff --git a/be/src/olap/rowset/segment_v2/page_io.cpp b/be/src/olap/rowset/segment_v2/page_io.cpp index 0fd13dd164a8aa..fe7d3c32e860a8 100644 --- a/be/src/olap/rowset/segment_v2/page_io.cpp +++ b/be/src/olap/rowset/segment_v2/page_io.cpp @@ -26,7 +26,6 @@ #include "olap/fs/block_manager.h" #include "olap/page_cache.h" #include "util/block_compression.h" -#include "runtime/thread_context.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/faststring.h" @@ -48,7 +47,7 @@ Status PageIO::compress_page_body(const BlockCompressionCodec* codec, double min Slice compressed_slice(buf); RETURN_IF_ERROR(codec->compress(body, &compressed_slice)); buf.resize(compressed_slice.get_size()); - + double space_saving = 1.0 - static_cast(buf.size()) / uncompressed_size; // return compressed body only when it saves more than min_space_saving if (space_saving > 0 
&& space_saving >= min_space_saving) { @@ -117,8 +116,10 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* auto cache = StoragePageCache::instance(); PageCacheHandle cache_handle; - StoragePageCache::CacheKey cache_key(opts.rblock->path_desc().filepath, opts.page_pointer.offset); - if (opts.use_page_cache && cache->is_cache_available(opts.type) && cache->lookup(cache_key, &cache_handle, opts.type)) { + StoragePageCache::CacheKey cache_key(opts.rblock->path_desc().filepath, + opts.page_pointer.offset); + if (opts.use_page_cache && cache->is_cache_available(opts.type) && + cache->lookup(cache_key, &cache_handle, opts.type)) { // we find page in cache, use it *handle = PageHandle(std::move(cache_handle)); opts.stats->cached_pages_num++; @@ -140,11 +141,7 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* } // hold compressed page at first, reset to decompressed page later - std::unique_ptr page; - { - // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); - page.reset(new char[page_size]); - } + std::unique_ptr page(new char[page_size]); Slice page_slice(page.get(), page_size); { SCOPED_RAW_TIMER(&opts.stats->io_ns); @@ -175,11 +172,8 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* return Status::Corruption("Bad page: page is compressed but codec is NO_COMPRESSION"); } SCOPED_RAW_TIMER(&opts.stats->decompress_ns); - std::unique_ptr decompressed_page; - { - // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); - decompressed_page.reset(new char[footer->uncompressed_size() + footer_size + 4]); - } + std::unique_ptr decompressed_page( + new char[footer->uncompressed_size() + footer_size + 4]); // decompress page body Slice compressed_body(page_slice.data, body_size); @@ -193,11 +187,8 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* // append footer and footer size memcpy(decompressed_body.data + decompressed_body.size, page_slice.data + body_size, 
footer_size + 4); - { - // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); - // free memory of compressed page - page = std::move(decompressed_page); - } + // free memory of compressed page + page = std::move(decompressed_page); page_slice = Slice(page.get(), footer->uncompressed_size() + footer_size + 4); opts.stats->uncompressed_bytes_read += page_slice.size; } else { diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp index 29ed12795dac05..3e373d288efc3a 100644 --- a/be/src/runtime/mem_pool.cpp +++ b/be/src/runtime/mem_pool.cpp @@ -44,7 +44,7 @@ MemPool::MemPool(MemTracker* mem_tracker) total_allocated_bytes_(0), total_reserved_bytes_(0), peak_allocated_bytes_(0), - // new_mem_tracker_(thread_local_ctx.thread_mem_tracker()), + new_mem_tracker_(thread_local_ctx.thread_mem_tracker()), mem_tracker_(mem_tracker) { DCHECK(mem_tracker != nullptr); } @@ -57,11 +57,9 @@ MemPool::~MemPool() { int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; - ChunkAllocator::instance()->free(chunk.chunk, new_mem_tracker_); + ChunkAllocator::instance()->free(chunk.chunk); } mem_tracker_->release(total_bytes_released); - // DCHECK(new_mem_tracker_ == thread_local_ctx.thread_mem_tracker()); - // new_mem_tracker_->release(total_bytes_released); DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } @@ -79,7 +77,7 @@ void MemPool::free_all() { int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; - ChunkAllocator::instance()->free(chunk.chunk, new_mem_tracker_); + ChunkAllocator::instance()->free(chunk.chunk); } chunks_.clear(); next_chunk_size_ = INITIAL_CHUNK_SIZE; @@ -88,8 +86,6 @@ void MemPool::free_all() { total_reserved_bytes_ = 0; mem_tracker_->release(total_bytes_released); - // DCHECK(new_mem_tracker_ == thread_local_ctx.thread_mem_tracker()); - // new_mem_tracker_->release(total_bytes_released); 
DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } @@ -131,22 +127,18 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) { } chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size); - // DCHECK(new_mem_tracker_ == thread_local_ctx.thread_mem_tracker()); if (check_limits) { Status st = mem_tracker_->try_consume(chunk_size); - // Status st2 = new_mem_tracker_->try_consume(chunk_size); WARN_IF_ERROR(st, "try to allocate a new buffer failed"); if (!st) return false; } else { mem_tracker_->consume(chunk_size); - // new_mem_tracker_->consume(chunk_size); } // Allocate a new chunk. Return early if allocate fails. Chunk chunk; - if (!ChunkAllocator::instance()->allocate(chunk_size, &chunk, new_mem_tracker_)) { + if (!ChunkAllocator::instance()->allocate(chunk_size, &chunk)) { mem_tracker_->release(chunk_size); - // new_mem_tracker_->release(chunk_size); return false; } ASAN_POISON_MEMORY_REGION(chunk.data, chunk_size); @@ -187,6 +179,8 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) { int64_t total_transferred_bytes = 0; for (auto i = src->chunks_.begin(); i != end_chunk; ++i) { total_transferred_bytes += i->chunk.size; + i->chunk.mem_tracker->transfer_to(new_mem_tracker_, i->chunk.size); + i->chunk.mem_tracker = new_mem_tracker_; } src->total_reserved_bytes_ -= total_transferred_bytes; total_reserved_bytes_ += total_transferred_bytes; @@ -196,24 +190,6 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) { src->mem_tracker_->release(total_transferred_bytes); mem_tracker_->consume(total_transferred_bytes); } - // if (src->new_mem_tracker_ != new_mem_tracker_) { - // // if (task_type_ == "QUERY" && src->task_type() == "UNKNOWN") { - // // thread_local_ctx.consume_mem(total_transferred_bytes); - // // } - // // if (mem_tracker_->GetQueryMemTracker() != nullptr && - // // src->mem_tracker_->GetQueryMemTracker() == nullptr) { - // // thread_local_ctx.consume_mem(total_transferred_bytes); - // // } - // // 
if ((new_mem_tracker_.lock()->GetQueryMemTracker() != nullptr && - // // src->new_mem_tracker_.lock()->GetQueryMemTracker() == nullptr) || - // // (new_mem_tracker_.lock()->GetQueryMemTracker() == nullptr && - // // src->new_mem_tracker_.lock()->GetQueryMemTracker() != nullptr)) { - // // new_mem_tracker_.lock()->consume(total_transferred_bytes); - // // src->new_mem_tracker_.lock()->release(total_transferred_bytes); - // // } - // new_mem_tracker_->consume(total_transferred_bytes); - // src->new_mem_tracker_->release(total_transferred_bytes); - // } // insert new chunks after current_chunk_idx_ auto insert_chunk = chunks_.begin() + current_chunk_idx_ + 1; @@ -248,16 +224,21 @@ void MemPool::exchange_data(MemPool* other) { std::swap(total_reserved_bytes_, other->total_reserved_bytes_); std::swap(peak_allocated_bytes_, other->peak_allocated_bytes_); std::swap(chunks_, other->chunks_); + + for (auto i = chunks_.begin(); i != chunks_.end(); ++i) { + i->chunk.mem_tracker->transfer_to(new_mem_tracker_, i->chunk.size); + i->chunk.mem_tracker = new_mem_tracker_; + } + for (auto i = other->chunks_.begin(); i != other->chunks_.end(); ++i) { + i->chunk.mem_tracker->transfer_to(other->new_mem_tracker_, i->chunk.size); + i->chunk.mem_tracker = other->new_mem_tracker_; + } // update MemTracker if (other->mem_tracker_ != mem_tracker_) { mem_tracker_->consume(delta_size); other->mem_tracker_->release(delta_size); } - // if (other->new_mem_tracker_ != new_mem_tracker_) { - // new_mem_tracker_->consume(delta_size); - // other->new_mem_tracker_->release(delta_size); - // } } std::string MemPool::debug_string() { diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h index faa0a4ad9c4ec2..3880bf3bbf4d3a 100644 --- a/be/src/runtime/mem_pool.h +++ b/be/src/runtime/mem_pool.h @@ -273,7 +273,6 @@ class MemPool { /// The current and peak memory footprint of this pool. This is different from /// total allocated_bytes_ since it includes bytes in chunks that are not used. 
MemTracker* mem_tracker_; - std::shared_ptr new_mem_tracker_; }; diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index b3a3b07996d2c8..fb5929a7b1f4f5 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -94,7 +94,6 @@ std::shared_ptr MemTracker::create_tracker(int64_t byte_limit, const reset_parent = get_root_tracker(); } - SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); std::shared_ptr tracker( new MemTracker(byte_limit, reset_label, reset_parent, level > reset_parent->_level ? level : reset_parent->_level, profile)); @@ -131,6 +130,8 @@ void MemTracker::Init() { } MemTracker::~MemTracker() { + // TCMalloc hook will be triggered during destructor memtracker, may cause crash. + if (_label == "Root") GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER(); if (parent()) { if (consumption() != 0) { memory_leak_check(this); @@ -149,8 +150,24 @@ MemTracker::~MemTracker() { _child_tracker_it = _parent->_child_trackers.end(); } } - // TCMalloc hook will be triggered during destructor memtracker, may cause crash. - SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); +} + +void MemTracker::transfer_to(std::shared_ptr dst, int64_t bytes) { + DCHECK_EQ(_all_trackers.back(), dst->_all_trackers.back()) << "Must have same ancestor"; + // Find the common ancestor and update trackers between 'this'/'dst' and + // the common ancestor. This logic handles all cases, including the + // two trackers being the same or being ancestors of each other because + // 'all_trackers_' includes the current tracker. 
+ int ancestor_idx = _all_trackers.size() - 1; + int dst_ancestor_idx = dst->_all_trackers.size() - 1; + while (ancestor_idx > 0 && dst_ancestor_idx > 0 && + _all_trackers[ancestor_idx - 1] == dst->_all_trackers[dst_ancestor_idx - 1]) { + --ancestor_idx; + --dst_ancestor_idx; + } + MemTracker* common_ancestor = _all_trackers[ancestor_idx]; + release(bytes, common_ancestor); + dst->consume(bytes, common_ancestor); } // Calling this on the query tracker results in output like: @@ -184,10 +201,10 @@ std::string MemTracker::log_usage(int max_recursive_depth, int64_t* logged_consu std::string detail = "MemTracker log_usage Label: {}, Limit: {}, Total: {}, Peak: {}, Exceeded: {}"; - fmt::format(detail, _label, PrettyPrinter::print(_limit, TUnit::BYTES), - PrettyPrinter::print(curr_consumption, TUnit::BYTES), - PrettyPrinter::print(peak_consumption, TUnit::BYTES), - limit_exceeded() ? "true" : "false"); + detail = fmt::format(detail, _label, PrettyPrinter::print(_limit, TUnit::BYTES), + PrettyPrinter::print(curr_consumption, TUnit::BYTES), + PrettyPrinter::print(peak_consumption, TUnit::BYTES), + limit_exceeded() ? "true" : "false"); // This call does not need the children, so return early. if (max_recursive_depth == 0) return detail; @@ -238,10 +255,11 @@ Status MemTracker::mem_limit_exceeded(RuntimeState* state, const std::string& de std::string detail = "Memory exceed limit. 
details: {}, Label: {}, could not allocate size {} without " "exceeding limit on backend: {}, Memory left in process limit: {}, by fragment: {}."; - fmt::format(detail, details, _label, PrettyPrinter::print(failed_allocation_size, TUnit::BYTES), - BackendOptions::get_localhost(), - PrettyPrinter::print(process_tracker->spare_capacity(), TUnit::BYTES), - print_id(state->fragment_instance_id())); + detail = fmt::format( + detail, details, _label, PrettyPrinter::print(failed_allocation_size, TUnit::BYTES), + BackendOptions::get_localhost(), + PrettyPrinter::print(process_tracker->spare_capacity(), TUnit::BYTES), + state != nullptr ? print_id(state->fragment_instance_id()) : std::string()); Status status = Status::MemoryLimitExceeded(detail); if (state != nullptr) state->log_error(detail); diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index 2126895fa9946b..c984f0a3accf19 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -86,7 +86,7 @@ class MemTracker { // Gets a shared_ptr to the "root" tracker, creating it if necessary. static std::shared_ptr get_root_tracker(); - // Increases consumption of this tracker and its ancestors by 'bytes'. + // Increases consumption of this tracker and its ancestors by 'bytes'. // up to (but not including) end_tracker. // This is useful if we want to move tracking between trackers that share a common (i.e. end_tracker) // ancestor. This happens when we want to update tracking on a particular mem tracker but the consumption @@ -182,6 +182,12 @@ class MemTracker { return Status::OK(); } + /// Transfer 'bytes' of consumption from this tracker to 'dst', updating + /// all ancestors up to the first shared ancestor. Must not be used if + /// 'dst' has a limit, or an ancestor with a limit, that is not a common + /// ancestor with the tracker, because this does not check memory limits. 
+ void transfer_to(std::shared_ptr dst, int64_t bytes); + // Returns true if a valid limit of this tracker or one of its ancestors is exceeded. MemTracker* limit_exceeded_tracker() const { for (const auto& tracker : _limit_trackers) { @@ -192,9 +198,7 @@ class MemTracker { return nullptr; } - bool any_limit_exceeded() const { - return limit_exceeded_tracker() != nullptr; - } + bool any_limit_exceeded() const { return limit_exceeded_tracker() != nullptr; } // Returns the maximum consumption that can be made without exceeding the limit on // this tracker or any of its parents. Returns int64_t::max() if there are no @@ -254,6 +258,9 @@ class MemTracker { Status mem_limit_exceeded(RuntimeState* state, const std::string& details = std::string(), int64_t failed_allocation = 0) WARN_UNUSED_RESULT; + // If an ancestor of this tracker is a Task MemTracker, return that tracker. Otherwise return nullptr. + MemTracker* get_task_mem_tracker(); + std::string debug_string() { std::stringstream msg; msg << "limit: " << _limit << "; " @@ -317,9 +324,6 @@ class MemTracker { } } - // If an ancestor of this tracker is a Task MemTracker, return that tracker. Otherwise return nullptr. - MemTracker* get_task_mem_tracker(); - // Creates the root tracker. 
static void create_root_tracker(); diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp index 435ae4093319de..25d6ed82a8fe24 100644 --- a/be/src/runtime/memory/chunk_allocator.cpp +++ b/be/src/runtime/memory/chunk_allocator.cpp @@ -118,8 +118,8 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) _arenas(CpuInfo::get_max_num_cores()) { _chunk_allocator_mem_tracker = MemTracker::create_tracker(static_cast(reserve_limit), "ChunkAllocator", - nullptr, MemTrackerLevel::OVERVIEW); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); + nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker, "ChunkAllocator", false); for (int i = 0; i < _arenas.size(); ++i) { _arenas[i].reset(new ChunkArena()); } @@ -134,24 +134,19 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_free_cost_ns); } -bool ChunkAllocator::allocate(size_t size, Chunk* chunk, std::shared_ptr caller_tracker) { +bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { // fast path: allocate from current core arena chunk->mem_tracker = thread_local_ctx.thread_mem_tracker(); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); - thread_local_ctx.transfer_to_external_tracker(chunk->mem_tracker, size); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker, "ChunkAllocator", false); + thread_local_ctx.thread_mem_tracker()->transfer_to(chunk->mem_tracker, size); int core_id = CpuInfo::get_current_core(); chunk->size = size; chunk->core_id = core_id; - if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) { DCHECK_GE(_reserved_bytes, 0); _reserved_bytes.fetch_sub(size); chunk_pool_local_core_alloc_count->increment(1); - // thread_local_ctx.transfer_in_thread_tracker(_chunk_allocator_mem_tracker, size); - // thread_local_ctx.transfer_to_external_tracker(chunk->mem_tracker, size); 
- // thread_local_ctx.transfer_to_external_tracker(caller_tracker, size); - // thread_local_ctx.consume_mem(size); return true; } if (_reserved_bytes > size) { @@ -164,10 +159,6 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk, std::shared_ptrincrement(1); // reset chunk's core_id to other chunk->core_id = core_id % _arenas.size(); - // thread_local_ctx.transfer_to_external_tracker(chunk->mem_tracker, size); - // thread_local_ctx.transfer_in_thread_tracker(_chunk_allocator_mem_tracker, size); - // thread_local_ctx.transfer_to_external_tracker(caller_tracker, size); - // thread_local_ctx.consume_mem(size); return true; } } @@ -178,29 +169,23 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk, std::shared_ptrdata = SystemAllocator::allocate(size); - // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); - // _chunk_allocator_mem_tracker->consume(size); } chunk_pool_system_alloc_count->increment(1); chunk_pool_system_alloc_cost_ns->increment(cost_ns); if (chunk->data == nullptr) { - thread_local_ctx.transfer_in_thread_tracker(chunk->mem_tracker, size); + chunk->mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), size); return false; } - // thread_local_ctx.transfer_to_external_tracker(chunk->mem_tracker, size); - // thread_local_ctx.transfer_in_thread_tracker(_chunk_allocator_mem_tracker, size); - // thread_local_ctx.transfer_to_external_tracker(caller_tracker, size); - // thread_local_ctx.consume_mem(size); return true; } -void ChunkAllocator::free(Chunk& chunk, std::shared_ptr caller_tracker) { +void ChunkAllocator::free(Chunk& chunk) { if (chunk.core_id == -1) { return; } DCHECK(chunk.mem_tracker != nullptr); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); - thread_local_ctx.transfer_in_thread_tracker(chunk.mem_tracker, chunk.size); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker, "ChunkAllocator", false); + 
chunk.mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), chunk.size); int64_t old_reserved_bytes = _reserved_bytes; int64_t new_reserved_bytes = 0; do { @@ -210,13 +195,7 @@ void ChunkAllocator::free(Chunk& chunk, std::shared_ptr caller_track { SCOPED_RAW_TIMER(&cost_ns); SystemAllocator::free(chunk.data, chunk.size); - // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker); - // _chunk_allocator_mem_tracker->release(chunk.size); } - // thread_local_ctx.release_mem(chunk.size); - // chunk.mem_tracker->transfer_to(_chunk_allocator_mem_tracker, chunk.size); - // thread_local_ctx.transfer_in_thread_tracker(chunk.mem_tracker, chunk.size); - // thread_local_ctx.transfer_in_thread_tracker(caller_tracker, chunk.size); chunk_pool_system_free_count->increment(1); chunk_pool_system_free_cost_ns->increment(cost_ns); @@ -226,10 +205,6 @@ void ChunkAllocator::free(Chunk& chunk, std::shared_ptr caller_track _arenas[chunk.core_id]->push_free_chunk(chunk.data, chunk.size); chunk.mem_tracker = nullptr; - // thread_local_ctx.transfer_in_thread_tracker(chunk.mem_tracker, chunk.size); - // chunk.mem_tracker->transfer_to(_chunk_allocator_mem_tracker, chunk.size); - // thread_local_ctx.transfer_in_thread_tracker(caller_tracker, chunk.size); - // thread_local_ctx.release_mem(chunk.size); } bool ChunkAllocator::allocate_align(size_t size, Chunk* chunk) { diff --git a/be/src/runtime/memory/chunk_allocator.h b/be/src/runtime/memory/chunk_allocator.h index 0ccd967c943c47..c94089596a4879 100644 --- a/be/src/runtime/memory/chunk_allocator.h +++ b/be/src/runtime/memory/chunk_allocator.h @@ -64,12 +64,12 @@ class ChunkAllocator { // Allocate a Chunk with a power-of-two length "size". // Return true if success and allocated chunk is saved in "chunk". // Otherwise return false. 
- bool allocate(size_t size, Chunk* chunk, std::shared_ptr caller_tracker = nullptr); + bool allocate(size_t size, Chunk* chunk); bool allocate_align(size_t size, Chunk* chunk); // Free chunk allocated from this allocator - void free(Chunk& chunk, std::shared_ptr caller_tracker = nullptr); + void free(Chunk& chunk); private: static ChunkAllocator* _s_instance; diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 5b78597776da3c..d97a3c7c9b745e 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -24,10 +24,17 @@ #define SCOPED_ATTACH_TASK_THREAD(type, task_id, fragment_instance_id) \ auto VARNAME_LINENUM(attach_task_thread) = AttachTaskThread(type, task_id, fragment_instance_id) -#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(mem_tracker) \ - auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker) // type, +#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(mem_tracker, action_name, cancel_work) \ + auto VARNAME_LINENUM(switch_tracker) = \ + SwitchThreadMemTracker(mem_tracker, action_name, cancel_work) +#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB(mem_tracker, action_name, cancel_work, \ + err_call_back_func) \ + auto VARNAME_LINENUM(switch_tracker) = \ + SwitchThreadMemTracker(mem_tracker, action_name, cancel_work, err_call_back_func) #define SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER() \ - auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker() + auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(true) +#define GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER() \ + auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(false) namespace doris { @@ -94,17 +101,28 @@ class ThreadContext { std::shared_ptr thread_mem_tracker() { return _thread_mem_tracker_mgr->mem_tracker().lock(); } - std::weak_ptr update_mem_tracker(std::weak_ptr mem_tracker) { + std::weak_ptr update_thread_tracker(std::weak_ptr mem_tracker) { return _thread_mem_tracker_mgr->update_tracker(mem_tracker); } - void 
transfer_to_external_tracker(std::shared_ptr dst_tracker, int64_t size) { - _thread_mem_tracker_mgr->transfer_to(dst_tracker, size); + std::shared_ptr update_thread_tracker_call_back( + const std::string& action_name, bool cancel_task, ERRCALLBACK err_call_back_func) { + return _thread_mem_tracker_mgr->update_consume_err_call_back(action_name, cancel_task, + err_call_back_func); } - void transfer_in_thread_tracker(std::shared_ptr source_tracker, int64_t size) { - _thread_mem_tracker_mgr->transfer_in(source_tracker, size); + std::shared_ptr update_thread_tracker_call_back( + std::shared_ptr tracker_call_back) { + return _thread_mem_tracker_mgr->update_consume_err_call_back(tracker_call_back); + } + void start_mem_tracker() { + if (_thread_mem_tracker_mgr != nullptr) { + _thread_mem_tracker_mgr->start_mem_tracker(); + } + } + void stop_mem_tracker() { + if (_thread_mem_tracker_mgr != nullptr) { + _thread_mem_tracker_mgr->stop_mem_tracker(); + } } - void start_mem_tracker() { _thread_mem_tracker_mgr->start_mem_tracker(); } - void stop_mem_tracker() { _thread_mem_tracker_mgr->stop_mem_tracker(); } private: std::thread::id _thread_id; @@ -152,21 +170,37 @@ class AttachTaskThread { class SwitchThreadMemTracker { public: - explicit SwitchThreadMemTracker(std::shared_ptr new_mem_tracker) { - _old_mem_tracker = thread_local_ctx.update_mem_tracker(new_mem_tracker); + explicit SwitchThreadMemTracker(std::shared_ptr mem_tracker, + const std::string& action_name = std::string(), + bool cancel_work = true, + ERRCALLBACK err_call_back_func = nullptr) { + _old_mem_tracker = thread_local_ctx.update_thread_tracker(mem_tracker); + _old_tracker_call_back = thread_local_ctx.update_thread_tracker_call_back( + action_name, cancel_work, err_call_back_func); } - ~SwitchThreadMemTracker() { thread_local_ctx.update_mem_tracker(_old_mem_tracker); } + ~SwitchThreadMemTracker() { + thread_local_ctx.update_thread_tracker(_old_mem_tracker); + 
thread_local_ctx.update_thread_tracker_call_back(_old_tracker_call_back); + } private: std::weak_ptr _old_mem_tracker; + std::shared_ptr _old_tracker_call_back; }; class StopThreadMemTracker { public: - explicit StopThreadMemTracker() { thread_local_ctx.stop_mem_tracker(); } + explicit StopThreadMemTracker(const bool scope = true) : _scope(scope) { + thread_local_ctx.stop_mem_tracker(); + } - ~StopThreadMemTracker() { thread_local_ctx.start_mem_tracker(); } + ~StopThreadMemTracker() { + if (_scope == true) thread_local_ctx.start_mem_tracker(); + } + +private: + bool _scope; }; } // namespace doris diff --git a/be/src/runtime/thread_mem_tracker_mgr.cpp b/be/src/runtime/thread_mem_tracker_mgr.cpp index 1fc6368875360c..053991e53f5f89 100644 --- a/be/src/runtime/thread_mem_tracker_mgr.cpp +++ b/be/src/runtime/thread_mem_tracker_mgr.cpp @@ -37,6 +37,7 @@ void ThreadMemTrackerMgr::attach_query(const std::string& query_id, DCHECK(query_id != "" && fragment_instance_id != TUniqueId()); _query_id = query_id; _fragment_instance_id = fragment_instance_id; + _consume_err_call_back = std::make_shared("Query", true, nullptr); #ifdef BE_TEST if (ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) { return; @@ -50,6 +51,7 @@ void ThreadMemTrackerMgr::detach() { update_tracker(default_mem_tracker()); _query_id = ""; _fragment_instance_id = TUniqueId(); + _consume_err_call_back = std::make_shared("", true, nullptr); } std::weak_ptr ThreadMemTrackerMgr::update_tracker( @@ -65,19 +67,35 @@ std::weak_ptr ThreadMemTrackerMgr::update_tracker( return old_mem_tracker; } -void ThreadMemTrackerMgr::exceeded_cancel_query(std::shared_ptr query_mem_tracker) { +std::shared_ptr ThreadMemTrackerMgr::update_consume_err_call_back( + const std::string& action_name, bool cancel_task, ERRCALLBACK call_back_func) { + std::shared_ptr old_consume_err_call_back = _consume_err_call_back; + _consume_err_call_back = + std::make_shared(action_name, cancel_task, call_back_func); + return 
old_consume_err_call_back; +} + +std::shared_ptr ThreadMemTrackerMgr::update_consume_err_call_back( + std::shared_ptr consume_err_call_back) { + std::shared_ptr old_consume_err_call_back = _consume_err_call_back; + _consume_err_call_back = consume_err_call_back; + return old_consume_err_call_back; +} + +void ThreadMemTrackerMgr::exceeded_cancel_query() { if (_fragment_instance_id != TUniqueId() && ExecEnv::GetInstance()->initialized() && ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { std::string detail = - " Query Memory exceed limit in TCMalloc Hook New, Backend: {}, Query: {}, " - "Fragment: {}, Used: {}, Limit: {}. You can change the limit by session variable " + " {} Memory exceed limit in TCMalloc Hook New, Backend: {}, QueryID: {}, " + "FragmentID: {}, Used: {}, Limit: {}. You can change the limit by session variable " "exec_mem_limit."; - fmt::format(detail, BackendOptions::get_localhost(), _query_id, - print_id(_fragment_instance_id), - std::to_string(query_mem_tracker->consumption()), - std::to_string(query_mem_tracker->limit())); ExecEnv::GetInstance()->fragment_mgr()->cancel( - _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED, detail); + _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED, + fmt::format(detail, _consume_err_call_back->action_name, + BackendOptions::get_localhost(), _query_id, + print_id(_fragment_instance_id), + std::to_string(_mem_tracker.lock()->consumption()), + std::to_string(_mem_tracker.lock()->limit()))); _fragment_instance_id = TUniqueId(); // Make sure it will only be canceled once } } @@ -86,31 +104,38 @@ void ThreadMemTrackerMgr::exceeded(Status st, int64_t mem_usage) { DCHECK(st.is_mem_limit_exceeded()); std::string detail = st.to_string() + ", in TCMalloc Hook New."; auto rst = _mem_tracker.lock()->mem_limit_exceeded(nullptr, detail, mem_usage); + if (_consume_err_call_back->call_back_func != nullptr) { + _consume_err_call_back->call_back_func(); + } 
if (_query_id != "") { std::shared_ptr query_mem_tracker = ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_query_mem_tracker( _query_id); - DCHECK(query_mem_tracker->limit_exceeded()); - exceeded_cancel_query(query_mem_tracker); + if (_consume_err_call_back->cancel_task == true || + (query_mem_tracker != nullptr && query_mem_tracker->limit_exceeded())) { + exceeded_cancel_query(); + } } LOG(WARNING) << rst.to_string(); } void ThreadMemTrackerMgr::noncache_consume() { - // Ensure thread safety - auto tracker = _mem_tracker.lock(); - // The first time get_root_tracker is called after the main thread starts, == nullptr. - if (tracker) { - _stop_mem_tracker = true; - Status st = _mem_tracker.lock()->try_consume(_untracked_mem); - if (!st) { - // The memory has been allocated, so when TryConsume fails, need to continue to complete - // the consume to ensure the accuracy of the statistics. - _mem_tracker.lock()->consume(_untracked_mem); - exceeded(st, _untracked_mem); + _stop_mem_tracker = true; + { + // Ensure thread safety + auto tracker = _mem_tracker.lock(); + // The first time get_root_tracker is called after the main thread starts, == nullptr. + if (tracker) { + Status st = _mem_tracker.lock()->try_consume(_untracked_mem); + if (!st) { + // The memory has been allocated, so when TryConsume fails, need to continue to complete + // the consume to ensure the accuracy of the statistics. 
+ _mem_tracker.lock()->consume(_untracked_mem); + exceeded(st, _untracked_mem); + } } - _stop_mem_tracker = false; } + _stop_mem_tracker = false; } void ThreadMemTrackerMgr::cache_consume(int64_t size) { diff --git a/be/src/runtime/thread_mem_tracker_mgr.h b/be/src/runtime/thread_mem_tracker_mgr.h index 874fe6c1cc958a..d519ae4eeeec83 100644 --- a/be/src/runtime/thread_mem_tracker_mgr.h +++ b/be/src/runtime/thread_mem_tracker_mgr.h @@ -25,6 +25,17 @@ namespace doris { +typedef void (*ERRCALLBACK)(); + +struct ConsumeErrCallBackInfo { + std::string action_name; + bool cancel_task; + ERRCALLBACK call_back_func; + + ConsumeErrCallBackInfo(std::string action_name, bool cancel_task, ERRCALLBACK call_back_func) + : action_name(action_name), cancel_task(cancel_task), call_back_func(call_back_func) {} +}; + // TCMalloc new/delete Hook is counted in the memory_tracker of the current thread. // // In the original design, the MemTracker consume method is called before the memory is allocated. @@ -45,6 +56,10 @@ class ThreadMemTrackerMgr { void detach(); std::weak_ptr update_tracker(std::weak_ptr mem_tracker); + std::shared_ptr update_consume_err_call_back( + const std::string& action_name, bool cancel_task, ERRCALLBACK call_back_func); + std::shared_ptr update_consume_err_call_back( + std::shared_ptr consume_err_call_back); // Note that, If call the memory allocation operation in TCMalloc new/delete Hook, // such as calling LOG/iostream/sstream/stringstream/etc. 
related methods, @@ -53,28 +68,12 @@ class ThreadMemTrackerMgr { void noncache_consume(); - void transfer_to(std::shared_ptr dst_tracker, int64_t size) { - DCHECK(!_mem_tracker.expired()); - if (dst_tracker != nullptr) { - _mem_tracker.lock()->release(size); - dst_tracker->consume(size); - } - } - - void transfer_in(std::shared_ptr source_tracker, int64_t size) { - DCHECK(!_mem_tracker.expired()); - if (source_tracker != nullptr) { - source_tracker->release(size); - _mem_tracker.lock()->consume(size); - } - } - std::weak_ptr mem_tracker() { return _mem_tracker; } void stop_mem_tracker() { _stop_mem_tracker = true; } void start_mem_tracker() { _stop_mem_tracker = false; } private: - void exceeded_cancel_query(std::shared_ptr query_mem_tracker); + void exceeded_cancel_query(); void exceeded(Status st, int64_t mem_usage); @@ -93,6 +92,8 @@ class ThreadMemTrackerMgr { // In addition, when ~RootTracker, TCMalloc delete hook release RootTracker will crash. bool _stop_mem_tracker = false; + std::shared_ptr _consume_err_call_back; + std::string _query_id; TUniqueId _fragment_instance_id; }; diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc index 1456cfc6ae2dad..fd51c40b699c99 100644 --- a/be/test/runtime/test_env.cc +++ b/be/test/runtime/test_env.cc @@ -31,8 +31,7 @@ namespace doris { TestEnv::TestEnv() - : _block_mgr_parent_tracker(MemTracker::create_tracker(-1, "BufferedBlockMgr2")), - _io_mgr_tracker(MemTracker::create_tracker(-1, "DiskIoMgr")) { + : _block_mgr_parent_tracker(MemTracker::create_tracker(-1, "BufferedBlockMgr2")) { // Some code will use ExecEnv::GetInstance(), so init the global ExecEnv singleton _exec_env = ExecEnv::GetInstance(); _exec_env->_thread_mgr = new ThreadResourceMgr(2); @@ -40,7 +39,7 @@ TestEnv::TestEnv() _exec_env->_process_mem_tracker = MemTracker::create_tracker(-1, "TestEnv"); _exec_env->_task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10); - 
_exec_env->disk_io_mgr()->init(_io_mgr_tracker); + _exec_env->disk_io_mgr()->init(-1); _exec_env->_scan_thread_pool = new PriorityThreadPool(1, 16); _exec_env->_result_queue_mgr = new ResultQueueMgr(); // TODO may need rpc support, etc. diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index e75bf710b16fa9..771d767c762676 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1458,7 +1458,7 @@ The size of the buffer before flashing ### `memory_leak_detection` * Type: bool -* Description: Whether to start memory leak detection, memory leak occurs when MemTracker is considered to be a negative value, but the actual MemTracker records inaccurately will also cause a negative value, so this feature is in the experimental stage. +* Description: Whether to start memory leak detection, when MemTracker is a negative value, it is considered that a memory leak has occurred, but the actual MemTracker records inaccurately will also cause a negative value, so this feature is in the experimental stage. 
* Default: false ### `max_segment_num_per_rowset` diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index a4944a3ad7b091..07587dd4545f81 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1477,7 +1477,7 @@ webserver默认工作线程数 ### `memory_leak_detection` * 类型: bool -* 描述: 是否启动内存泄漏检测,认为 MemTracker 为负值时发生内存泄漏,但实际 MemTracker 记录不准确时也会导致负值,所以这个功能处于实验阶段。 +* 描述: 是否启动内存泄漏检测,当 MemTracker 为负值时认为发生了内存泄漏,但实际 MemTracker 记录不准确时也会导致负值,所以这个功能处于实验阶段。 * 默认值: false ### `max_segment_num_per_rowset` From 70174ab0c2041ea1cb526835b4d9999e0bc91867 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Wed, 26 Jan 2022 19:09:01 +0800 Subject: [PATCH 13/14] last --- be/src/agent/task_worker_pool.cpp | 2 +- be/src/exec/aggregation_node.cpp | 9 +- be/src/exec/analytic_eval_node.cpp | 21 ++- be/src/exec/assert_num_rows_node.cpp | 3 + be/src/exec/base_scanner.cpp | 11 +- be/src/exec/blocking_join_node.cpp | 10 +- be/src/exec/broker_scan_node.cpp | 7 +- be/src/exec/broker_scanner.cpp | 1 - be/src/exec/broker_scanner.h | 1 - be/src/exec/cross_join_node.cpp | 11 +- be/src/exec/csv_scan_node.cpp | 7 +- be/src/exec/data_sink.cpp | 2 +- be/src/exec/data_sink.h | 1 - be/src/exec/es/es_scroll_parser.cpp | 10 +- be/src/exec/es_http_scan_node.cpp | 11 +- be/src/exec/es_http_scanner.cpp | 10 +- be/src/exec/es_http_scanner.h | 2 - be/src/exec/es_scan_node.cpp | 10 +- be/src/exec/except_node.cpp | 7 +- be/src/exec/exchange_node.cpp | 8 +- be/src/exec/exec_node.cpp | 5 - be/src/exec/exec_node.h | 9 +- be/src/exec/hash_join_node.cpp | 20 ++- be/src/exec/hash_join_node_ir.cpp | 170 ++++++++++++++++++ be/src/exec/hash_table.cpp | 6 +- be/src/exec/intersect_node.cpp | 6 +- be/src/exec/json_scanner.cpp | 1 - be/src/exec/json_scanner.h | 1 - be/src/exec/merge_join_node.cpp | 21 +-- be/src/exec/merge_join_node.h | 5 +- be/src/exec/merge_node.cpp | 9 +- be/src/exec/mysql_scan_node.cpp | 7 
+- be/src/exec/odbc_scan_node.cpp | 7 +- be/src/exec/olap_scan_node.cpp | 49 ++--- be/src/exec/olap_scan_node.h | 2 +- be/src/exec/olap_scanner.cpp | 19 +- be/src/exec/olap_scanner.h | 3 +- be/src/exec/orc_scanner.cpp | 3 +- be/src/exec/partitioned_aggregation_node.cc | 42 +++-- be/src/exec/partitioned_hash_table.cc | 12 +- be/src/exec/partitioned_hash_table.h | 3 +- be/src/exec/repeat_node.cpp | 8 +- be/src/exec/schema_scan_node.cpp | 7 +- be/src/exec/select_node.cpp | 7 +- be/src/exec/set_operation_node.cpp | 9 +- be/src/exec/spill_sort_node.cc | 6 +- be/src/exec/table_function_node.cpp | 7 +- be/src/exec/tablet_info.cpp | 10 +- be/src/exec/tablet_info.h | 2 - be/src/exec/tablet_sink.cpp | 50 ++++-- be/src/exec/tablet_sink.h | 14 +- be/src/exec/topn_node.cpp | 9 +- be/src/exec/union_node.cpp | 10 +- be/src/exprs/agg_fn.h | 1 - be/src/exprs/agg_fn_evaluator.cpp | 2 +- be/src/exprs/anyval_util.cpp | 5 +- be/src/exprs/bloomfilter_predicate.h | 13 +- be/src/exprs/create_predicate_function.h | 48 +++-- be/src/exprs/expr.cpp | 25 ++- be/src/exprs/expr.h | 10 +- be/src/exprs/expr_context.cpp | 29 ++- be/src/exprs/expr_context.h | 2 + be/src/exprs/new_agg_fn_evaluator.cc | 16 +- be/src/exprs/new_agg_fn_evaluator.h | 6 +- be/src/exprs/runtime_filter.cpp | 42 ++--- be/src/exprs/runtime_filter.h | 14 +- be/src/http/action/compaction_action.cpp | 8 +- be/src/olap/aggregate_func.h | 4 - be/src/olap/base_compaction.cpp | 9 +- be/src/olap/base_compaction.h | 3 +- be/src/olap/bloom_filter_predicate.cpp | 6 +- be/src/olap/compaction.cpp | 13 +- be/src/olap/compaction.h | 4 +- be/src/olap/cumulative_compaction.cpp | 9 +- be/src/olap/cumulative_compaction.h | 3 +- be/src/olap/delta_writer.cpp | 35 ++-- be/src/olap/delta_writer.h | 7 +- be/src/olap/fs/block_manager.h | 5 - be/src/olap/fs/file_block_manager.cpp | 5 +- be/src/olap/fs/file_block_manager.h | 5 - be/src/olap/generic_iterators.cpp | 29 ++- be/src/olap/generic_iterators.h | 4 +- be/src/olap/iterators.h | 3 - 
be/src/olap/lru_cache.cpp | 34 ++-- be/src/olap/lru_cache.h | 9 +- be/src/olap/memtable.cpp | 9 +- be/src/olap/memtable.h | 3 +- be/src/olap/memtable_flush_executor.cpp | 3 +- be/src/olap/merger.cpp | 4 +- be/src/olap/olap_index.cpp | 3 +- be/src/olap/olap_index.h | 1 - be/src/olap/page_cache.cpp | 10 +- be/src/olap/push_handler.cpp | 4 +- be/src/olap/push_handler.h | 1 - be/src/olap/reader.cpp | 5 +- be/src/olap/reader.h | 1 - be/src/olap/row_block.cpp | 9 +- be/src/olap/row_block.h | 4 +- be/src/olap/row_block2.cpp | 6 +- be/src/olap/row_block2.h | 3 - be/src/olap/rowset/alpha_rowset.cpp | 8 - be/src/olap/rowset/alpha_rowset.h | 3 - be/src/olap/rowset/alpha_rowset_reader.cpp | 10 +- be/src/olap/rowset/alpha_rowset_reader.h | 5 +- be/src/olap/rowset/beta_rowset.cpp | 8 - be/src/olap/rowset/beta_rowset.h | 3 - be/src/olap/rowset/beta_rowset_reader.cpp | 21 +-- be/src/olap/rowset/beta_rowset_reader.h | 6 +- be/src/olap/rowset/beta_rowset_writer.cpp | 4 +- be/src/olap/rowset/column_data.cpp | 21 +-- be/src/olap/rowset/column_data.h | 7 +- be/src/olap/rowset/rowset.h | 5 - be/src/olap/rowset/rowset_writer_context.h | 1 - be/src/olap/rowset/segment_reader.cpp | 26 +-- be/src/olap/rowset/segment_reader.h | 4 +- .../rowset/segment_v2/binary_dict_page.cpp | 3 +- .../olap/rowset/segment_v2/binary_dict_page.h | 2 - .../rowset/segment_v2/bitmap_index_reader.h | 5 +- .../rowset/segment_v2/bitmap_index_writer.cpp | 5 +- .../segment_v2/bloom_filter_index_reader.h | 5 +- .../segment_v2/bloom_filter_index_writer.cpp | 5 +- be/src/olap/rowset/segment_v2/column_reader.h | 7 +- be/src/olap/rowset/segment_v2/column_writer.h | 4 - .../segment_v2/indexed_column_writer.cpp | 3 +- .../rowset/segment_v2/indexed_column_writer.h | 2 - be/src/olap/rowset/segment_v2/segment.cpp | 19 +- be/src/olap/rowset/segment_v2/segment.h | 6 +- .../rowset/segment_v2/segment_iterator.cpp | 18 +- .../olap/rowset/segment_v2/segment_iterator.h | 3 +- .../olap/rowset/segment_v2/segment_writer.cpp | 7 +- 
.../olap/rowset/segment_v2/segment_writer.h | 2 +- .../olap/rowset/segment_v2/zone_map_index.cpp | 6 +- .../olap/rowset/segment_v2/zone_map_index.h | 2 - be/src/olap/schema_change.cpp | 92 +++++----- be/src/olap/schema_change.h | 16 +- be/src/olap/segment_loader.cpp | 5 +- be/src/olap/segment_loader.h | 2 - be/src/olap/snapshot_manager.cpp | 4 + be/src/olap/snapshot_manager.h | 7 +- be/src/olap/storage_engine.cpp | 25 +-- be/src/olap/storage_engine.h | 7 + be/src/olap/tablet.cpp | 10 +- be/src/olap/tablet.h | 6 +- be/src/olap/tablet_manager.cpp | 5 +- be/src/olap/task/engine_alter_tablet_task.cpp | 10 +- be/src/olap/task/engine_alter_tablet_task.h | 1 + be/src/olap/task/engine_batch_load_task.cpp | 5 + be/src/olap/task/engine_batch_load_task.h | 1 + be/src/olap/task/engine_checksum_task.cpp | 13 +- be/src/olap/task/engine_clone_task.cpp | 14 +- be/src/olap/task/engine_clone_task.h | 1 + be/src/olap/tuple_reader.cpp | 1 - be/src/runtime/buffered_block_mgr2.cc | 25 +-- be/src/runtime/buffered_block_mgr2.h | 9 +- be/src/runtime/buffered_tuple_stream2.cc | 2 +- be/src/runtime/buffered_tuple_stream3.cc | 7 +- be/src/runtime/buffered_tuple_stream3.h | 4 +- be/src/runtime/bufferpool/buffer_pool.cc | 3 +- be/src/runtime/cache/result_cache.h | 1 - be/src/runtime/data_stream_mgr.cpp | 2 +- be/src/runtime/data_stream_mgr.h | 1 - be/src/runtime/data_stream_recvr.cc | 34 ++-- be/src/runtime/data_stream_recvr.h | 11 +- be/src/runtime/data_stream_sender.cpp | 7 +- be/src/runtime/disk_io_mgr.cc | 88 ++++----- be/src/runtime/disk_io_mgr.h | 18 +- be/src/runtime/dpp_sink.cpp | 5 +- be/src/runtime/exec_env.h | 9 +- be/src/runtime/exec_env_init.cpp | 19 +- be/src/runtime/export_sink.cpp | 5 +- be/src/runtime/export_sink.h | 3 - be/src/runtime/fold_constant_executor.cpp | 6 +- be/src/runtime/fragment_mgr.cpp | 13 +- be/src/runtime/free_pool.hpp | 2 +- be/src/runtime/initial_reservations.cc | 3 +- be/src/runtime/load_channel.cpp | 13 +- be/src/runtime/load_channel.h | 3 +- 
be/src/runtime/load_channel_mgr.cpp | 25 ++- be/src/runtime/mem_pool.cpp | 82 +++++---- be/src/runtime/mem_pool.h | 53 +++--- be/src/runtime/mem_tracker.cpp | 109 ++++++----- be/src/runtime/mem_tracker.h | 141 +++++++++------ be/src/runtime/mem_tracker_task_pool.cpp | 62 ++++--- be/src/runtime/mem_tracker_task_pool.h | 28 +-- be/src/runtime/memory/chunk_allocator.cpp | 37 ++-- be/src/runtime/memory/chunk_allocator.h | 7 +- be/src/runtime/memory_scratch_sink.h | 1 - be/src/runtime/odbc_table_sink.cpp | 14 +- be/src/runtime/odbc_table_sink.h | 4 +- be/src/runtime/plan_fragment_executor.cpp | 27 +-- be/src/runtime/plan_fragment_executor.h | 1 - be/src/runtime/qsorter.cpp | 2 +- be/src/runtime/result_file_sink.cpp | 6 +- be/src/runtime/result_file_sink.h | 1 - be/src/runtime/result_sink.h | 1 - be/src/runtime/row_batch.cpp | 33 ++-- be/src/runtime/row_batch.h | 6 +- be/src/runtime/runtime_filter_mgr.cpp | 38 ++-- be/src/runtime/runtime_filter_mgr.h | 5 +- be/src/runtime/runtime_state.cpp | 96 +++------- be/src/runtime/runtime_state.h | 42 +---- be/src/runtime/sorted_run_merger.cc | 7 +- be/src/runtime/sorted_run_merger.h | 5 +- be/src/runtime/spill_sorter.cc | 8 +- be/src/runtime/tablets_channel.cpp | 16 +- be/src/runtime/tablets_channel.h | 2 +- be/src/runtime/thread_context.h | 131 ++++++++++---- be/src/runtime/thread_mem_tracker_mgr.cpp | 77 ++++---- be/src/runtime/thread_mem_tracker_mgr.h | 29 +-- be/src/runtime/vectorized_row_batch.cpp | 6 +- be/src/runtime/vectorized_row_batch.h | 4 +- be/src/service/doris_main.cpp | 2 +- be/src/testutil/function_utils.cpp | 10 +- be/src/testutil/function_utils.h | 2 - be/src/util/arrow/row_batch.cpp | 11 +- be/src/util/arrow/row_batch.h | 5 +- be/src/util/doris_metrics.h | 1 + be/src/vec/exec/join/vhash_join_node.cpp | 15 +- be/src/vec/exec/join/vhash_join_node.h | 2 + be/src/vec/exec/vaggregation_node.cpp | 18 +- be/src/vec/exec/vaggregation_node.h | 2 + be/src/vec/exec/vanalytic_eval_node.cpp | 7 +- 
be/src/vec/exec/vblocking_join_node.cpp | 8 +- be/src/vec/exec/vcross_join_node.cpp | 11 +- be/src/vec/exec/vcross_join_node.h | 2 + be/src/vec/exec/ves_http_scan_node.cpp | 2 +- be/src/vec/exec/volap_scan_node.cpp | 18 +- be/src/vec/exec/volap_scan_node.h | 2 + be/src/vec/exec/volap_scanner.cpp | 7 +- be/src/vec/exec/volap_scanner.h | 3 +- be/src/vec/exec/vset_operation_node.cpp | 18 +- be/src/vec/exec/vset_operation_node.h | 2 + be/src/vec/exec/vsort_node.cpp | 10 +- be/src/vec/exec/vsort_node.h | 2 + be/src/vec/exprs/vexpr_context.cpp | 11 +- be/src/vec/exprs/vexpr_context.h | 2 + be/src/vec/olap/vgeneric_iterators.cpp | 18 +- be/src/vec/olap/vgeneric_iterators.h | 4 +- be/src/vec/runtime/vdata_stream_mgr.cpp | 2 +- be/src/vec/runtime/vdata_stream_recvr.cpp | 35 ++-- be/src/vec/runtime/vdata_stream_recvr.h | 10 +- be/src/vec/sink/vdata_stream_sender.cpp | 11 +- .../runtime/buffered_tuple_stream2_test.cpp | 4 +- .../org/apache/doris/qe/SessionVariable.java | 19 +- .../org/apache/doris/qe/SimpleScheduler.java | 4 +- 245 files changed, 1763 insertions(+), 1534 deletions(-) create mode 100644 be/src/exec/hash_join_node_ir.cpp diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index a0eaf570714ebd..2aa7b522afb823 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -515,7 +515,7 @@ void TaskWorkerPool::_alter_tablet(const TAgentTaskRequest& agent_task_req, int6 string process_name; switch (task_type) { case TTaskType::ALTER: - process_name = "alter"; + process_name = "AlterTablet"; break; default: std::string task_name; diff --git a/be/src/exec/aggregation_node.cpp b/be/src/exec/aggregation_node.cpp index 82c8f6c267d35f..0d3ed04ca50565 100644 --- a/be/src/exec/aggregation_node.cpp +++ b/be/src/exec/aggregation_node.cpp @@ -34,6 +34,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.hpp" +#include "runtime/thread_context.h" #include "runtime/tuple.h" 
#include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -77,6 +78,7 @@ Status AggregationNode::init(const TPlanNode& tnode, RuntimeState* state) { Status AggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _get_results_timer = ADD_TIMER(runtime_profile(), "GetResultsTime"); _hash_table_buckets_counter = ADD_COUNTER(runtime_profile(), "BuildBuckets", TUnit::UNIT); @@ -106,7 +108,7 @@ Status AggregationNode::prepare(RuntimeState* state) { RowDescriptor build_row_desc(_intermediate_tuple_desc, false); RETURN_IF_ERROR(Expr::prepare(_build_expr_ctxs, state, build_row_desc, expr_mem_tracker())); - _tuple_pool.reset(new MemPool(mem_tracker().get())); + _tuple_pool.reset(new MemPool()); _agg_fn_ctxs.resize(_aggregate_evaluators.size()); int j = _probe_expr_ctxs.size(); @@ -141,6 +143,7 @@ Status AggregationNode::prepare(RuntimeState* state) { } Status AggregationNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); @@ -153,7 +156,7 @@ Status AggregationNode::open(RuntimeState* state) { RETURN_IF_ERROR(_children[0]->open(state)); - RowBatch batch(_children[0]->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch batch(_children[0]->row_desc(), state->batch_size()); int64_t num_input_rows = 0; int64_t num_agg_rows = 0; @@ -227,6 +230,7 @@ Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* // 3. 
`child(0)->rows_returned() == 0` mean not data from child // in level two aggregation node should return nullptr result // level one aggregation node set `eos = true` return directly + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (UNLIKELY(!_needs_finalize && _singleton_output_tuple != nullptr && child(0)->rows_returned() == 0)) { *eos = true; @@ -288,6 +292,7 @@ Status AggregationNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // Iterate through the remaining rows in the hash table and call Serialize/Finalize on // them in order to free any memory allocated by UDAs. Finalize() requires a dst tuple diff --git a/be/src/exec/analytic_eval_node.cpp b/be/src/exec/analytic_eval_node.cpp index 2a9afa38687c04..3d12cb48fd6ec3 100644 --- a/be/src/exec/analytic_eval_node.cpp +++ b/be/src/exec/analytic_eval_node.cpp @@ -22,6 +22,7 @@ #include "runtime/descriptors.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "udf/udf_internal.h" namespace doris { @@ -141,10 +142,11 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); DCHECK(child(0)->row_desc().is_prefix_of(row_desc())); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _child_tuple_desc = child(0)->row_desc().tuple_descriptors()[0]; - _curr_tuple_pool.reset(new MemPool(mem_tracker().get())); - _prev_tuple_pool.reset(new MemPool(mem_tracker().get())); - _mem_pool.reset(new MemPool(mem_tracker().get())); + _curr_tuple_pool.reset(new MemPool()); + _prev_tuple_pool.reset(new MemPool()); + _mem_pool.reset(new MemPool()); _evaluation_timer = ADD_TIMER(runtime_profile(), "EvaluationTime"); DCHECK_EQ(_result_tuple_desc->slots().size(), _evaluators.size()); @@ -183,6 +185,7 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { } Status 
AnalyticEvalNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_CANCELLED(state); @@ -201,7 +204,7 @@ Status AnalyticEvalNode::open(RuntimeState* state) { "Failed to acquire initial read buffer for analytic function " "evaluation. Reducing query concurrency or increasing the memory limit may " "help this query to complete successfully."); - return mem_tracker()->mem_limit_exceeded(state, msg, -1); + RETURN_LIMIT_EXCEEDED(mem_tracker(), state, msg); } DCHECK_EQ(_evaluators.size(), _fn_ctxs.size()); @@ -236,10 +239,8 @@ Status AnalyticEvalNode::open(RuntimeState* state) { // Fetch the first input batch so that some _prev_input_row can be set here to avoid // special casing in GetNext(). - _prev_child_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); - _curr_child_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _prev_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); + _curr_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); while (!_input_eos && _prev_input_row == nullptr) { RETURN_IF_ERROR(child(0)->get_next(state, _curr_child_batch.get(), &_input_eos)); @@ -738,7 +739,7 @@ Status AnalyticEvalNode::get_next_output_batch(RuntimeState* state, RowBatch* ou ExprContext** ctxs = &_conjunct_ctxs[0]; int num_ctxs = _conjunct_ctxs.size(); - RowBatch input_batch(child(0)->row_desc(), output_batch->capacity(), mem_tracker().get()); + RowBatch input_batch(child(0)->row_desc(), output_batch->capacity()); int64_t stream_idx = _input_stream->rows_returned(); RETURN_IF_ERROR(_input_stream->get_next(&input_batch, eos)); @@ -813,6 +814,7 @@ inline int64_t AnalyticEvalNode::num_output_rows_ready() const { } Status AnalyticEvalNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); @@ -857,6 +859,7 @@ Status AnalyticEvalNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_input_stream.get() != nullptr) { _input_stream->close(); diff --git a/be/src/exec/assert_num_rows_node.cpp b/be/src/exec/assert_num_rows_node.cpp index 6c84dfc1f05f8d..d25a0c071da1fc 100644 --- a/be/src/exec/assert_num_rows_node.cpp +++ b/be/src/exec/assert_num_rows_node.cpp @@ -21,6 +21,7 @@ #include "gutil/strings/substitute.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -48,6 +49,7 @@ Status AssertNumRowsNode::prepare(RuntimeState* state) { } Status AssertNumRowsNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); // ISSUE-3435 @@ -56,6 +58,7 @@ Status AssertNumRowsNode::open(RuntimeState* state) { } Status AssertNumRowsNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); SCOPED_TIMER(_runtime_profile->total_time_counter()); output_batch->reset(); diff --git a/be/src/exec/base_scanner.cpp b/be/src/exec/base_scanner.cpp index eb29bafa4ba066..d5dd7ec5ff2ad7 100644 --- a/be/src/exec/base_scanner.cpp +++ b/be/src/exec/base_scanner.cpp @@ -33,8 +33,7 @@ namespace doris { BaseScanner::BaseScanner(RuntimeState* state, RuntimeProfile* profile, const TBrokerScanRangeParams& params, - const std::vector& pre_filter_texprs, - ScannerCounter* counter) + const std::vector& pre_filter_texprs, ScannerCounter* 
counter) : _state(state), _params(params), _counter(counter), @@ -43,11 +42,10 @@ BaseScanner::BaseScanner(RuntimeState* state, RuntimeProfile* profile, #if BE_TEST _mem_tracker(new MemTracker()), #else - _mem_tracker( - MemTracker::create_tracker(-1, "BaseScanner:" + std::to_string(state->load_job_id()), - state->instance_mem_tracker())), + _mem_tracker(MemTracker::create_tracker( + -1, "Scanner:" + std::to_string(state->load_job_id()))), #endif - _mem_pool(_mem_tracker.get()), + _mem_pool(_mem_tracker), _dest_tuple_desc(nullptr), _pre_filter_texprs(pre_filter_texprs), _strict_mode(false), @@ -259,5 +257,4 @@ void BaseScanner::close() { } } - } // namespace doris diff --git a/be/src/exec/blocking_join_node.cpp b/be/src/exec/blocking_join_node.cpp index d46e54dc0eefca..0f5d4f626dd284 100644 --- a/be/src/exec/blocking_join_node.cpp +++ b/be/src/exec/blocking_join_node.cpp @@ -46,8 +46,9 @@ BlockingJoinNode::~BlockingJoinNode() { Status BlockingJoinNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); - _build_pool.reset(new MemPool(mem_tracker().get())); + _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _left_child_timer = ADD_TIMER(runtime_profile(), "LeftChildTime"); _build_row_counter = ADD_COUNTER(runtime_profile(), "BuildRows", TUnit::UNIT); @@ -70,7 +71,7 @@ Status BlockingJoinNode::prepare(RuntimeState* state) { _probe_tuple_row_size = num_left_tuples * sizeof(Tuple*); _build_tuple_row_size = num_build_tuples * sizeof(Tuple*); - _left_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _left_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); return Status::OK(); } @@ -83,12 +84,13 @@ Status BlockingJoinNode::close(RuntimeState* state) { } void BlockingJoinNode::build_side_thread(RuntimeState* state, std::promise* 
status) { - SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(state->query_id()), - state->fragment_instance_id()); + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), mem_tracker()); status->set_value(construct_build_side(state)); } Status BlockingJoinNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); SCOPED_TIMER(_runtime_profile->total_time_counter()); // RETURN_IF_ERROR(Expr::open(_conjuncts, state)); diff --git a/be/src/exec/broker_scan_node.cpp b/be/src/exec/broker_scan_node.cpp index 344ca3f95e2515..bfdce6ca3f4b96 100644 --- a/be/src/exec/broker_scan_node.cpp +++ b/be/src/exec/broker_scan_node.cpp @@ -30,6 +30,7 @@ #include "runtime/dpp_sink_internal.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -60,6 +61,7 @@ Status BrokerScanNode::init(const TPlanNode& tnode, RuntimeState* state) { Status BrokerScanNode::prepare(RuntimeState* state) { VLOG_QUERY << "BrokerScanNode prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get tuple desc _runtime_state = state; _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -86,6 +88,7 @@ Status BrokerScanNode::prepare(RuntimeState* state) { } Status BrokerScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); @@ -106,6 +109,7 @@ Status BrokerScanNode::start_scanners() { } Status BrokerScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // check if 
CANCELLED. if (state->is_cancelled()) { @@ -190,6 +194,7 @@ Status BrokerScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); _scan_finished.store(true); @@ -254,7 +259,7 @@ Status BrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, while (!scanner_eof) { // Fill one row batch std::shared_ptr row_batch( - new RowBatch(row_desc(), _runtime_state->batch_size(), mem_tracker().get())); + new RowBatch(row_desc(), _runtime_state->batch_size())); // create new tuple buffer for row_batch MemPool* tuple_pool = row_batch->tuple_data_pool(); diff --git a/be/src/exec/broker_scanner.cpp b/be/src/exec/broker_scanner.cpp index af44b8f7ea99ba..24fa94fe25753d 100644 --- a/be/src/exec/broker_scanner.cpp +++ b/be/src/exec/broker_scanner.cpp @@ -35,7 +35,6 @@ #include "exprs/expr.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" #include "runtime/raw_value.h" #include "runtime/stream_load/load_stream_mgr.h" #include "runtime/stream_load/stream_load_pipe.h" diff --git a/be/src/exec/broker_scanner.h b/be/src/exec/broker_scanner.h index ca66cd0c44826e..638b6eb99e8d7e 100644 --- a/be/src/exec/broker_scanner.h +++ b/be/src/exec/broker_scanner.h @@ -46,7 +46,6 @@ class ExprContext; class TupleDescriptor; class TupleRow; class RowDescriptor; -class MemTracker; class RuntimeProfile; class StreamLoadPipe; diff --git a/be/src/exec/cross_join_node.cpp b/be/src/exec/cross_join_node.cpp index 8ef9b662f8fb65..a4f00e0e4d8b78 100644 --- a/be/src/exec/cross_join_node.cpp +++ b/be/src/exec/cross_join_node.cpp @@ -23,6 +23,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" #include "util/runtime_profile.h" @@ -52,10 
+53,10 @@ Status CrossJoinNode::close(RuntimeState* state) { Status CrossJoinNode::construct_build_side(RuntimeState* state) { // Do a full scan of child(1) and store all build row batches. RETURN_IF_ERROR(child(1)->open(state)); - + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB("Cross join, while getting next from the child 1"); while (true) { - RowBatch* batch = _build_batch_pool->add( - new RowBatch(child(1)->row_desc(), state->batch_size(), mem_tracker().get())); + RowBatch* batch = + _build_batch_pool->add(new RowBatch(child(1)->row_desc(), state->batch_size())); RETURN_IF_CANCELLED(state); // TODO(zhaochun): @@ -63,9 +64,6 @@ Status CrossJoinNode::construct_build_side(RuntimeState* state) { bool eos = false; RETURN_IF_ERROR(child(1)->get_next(state, batch, &eos)); - // to prevent use too many memory - RETURN_IF_LIMIT_EXCEEDED(state, "Cross join, while getting next from the child 1."); - SCOPED_TIMER(_build_timer); _build_batches.add_row_batch(batch); VLOG_ROW << build_list_debug_string(); @@ -86,6 +84,7 @@ void CrossJoinNode::init_get_next(TupleRow* first_left_row) { Status CrossJoinNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { // RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT, state)); RETURN_IF_CANCELLED(state); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); *eos = false; // TOOD(zhaochun) // RETURN_IF_ERROR(state->check_query_state()); diff --git a/be/src/exec/csv_scan_node.cpp b/be/src/exec/csv_scan_node.cpp index e005b81d9002e9..d9ac8ea0483a4e 100644 --- a/be/src/exec/csv_scan_node.cpp +++ b/be/src/exec/csv_scan_node.cpp @@ -29,6 +29,7 @@ #include "olap/utils.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" #include "util/debug_util.h" @@ -128,6 +129,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); + 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // add timer _split_check_timer = ADD_TIMER(_runtime_profile, "split check timer"); @@ -195,7 +197,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { return Status::InternalError("new a csv scanner failed."); } - _tuple_pool.reset(new (std::nothrow) MemPool(state->instance_mem_tracker().get())); + _tuple_pool.reset(new (std::nothrow) MemPool()); if (_tuple_pool.get() == nullptr) { return Status::InternalError("new a mem pool failed."); } @@ -210,6 +212,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { } Status CsvScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); VLOG_CRITICAL << "CsvScanNode::Open"; @@ -232,6 +235,7 @@ Status CsvScanNode::open(RuntimeState* state) { } Status CsvScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "CsvScanNode::GetNext"; if (nullptr == state || nullptr == row_batch || nullptr == eos) { return Status::InternalError("input is nullptr pointer"); @@ -320,6 +324,7 @@ Status CsvScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "CsvScanNode::Close"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); diff --git a/be/src/exec/data_sink.cpp b/be/src/exec/data_sink.cpp index 994d98a35d233c..1113d98dcf4782 100644 --- a/be/src/exec/data_sink.cpp +++ b/be/src/exec/data_sink.cpp @@ -182,7 +182,7 @@ Status DataSink::init(const TDataSink& thrift_sink) { Status DataSink::prepare(RuntimeState* state) { _expr_mem_tracker = MemTracker::create_tracker(-1, _name + ":Expr:" + std::to_string(state->load_job_id()), - state->instance_mem_tracker()); + state->instance_mem_tracker()); return Status::OK(); } diff --git a/be/src/exec/data_sink.h b/be/src/exec/data_sink.h index 
fcec10aed8825e..a3d8b4ee1ee712 100644 --- a/be/src/exec/data_sink.h +++ b/be/src/exec/data_sink.h @@ -68,7 +68,6 @@ class DataSink { // It must be okay to call this multiple times. Subsequent calls should // be ignored. virtual Status close(RuntimeState* state, Status exec_status) { - _expr_mem_tracker.reset(); _closed = true; return Status::OK(); } diff --git a/be/src/exec/es/es_scroll_parser.cpp b/be/src/exec/es/es_scroll_parser.cpp index 06e693a5faea80..cfcc1ff2a0db3a 100644 --- a/be/src/exec/es/es_scroll_parser.cpp +++ b/be/src/exec/es/es_scroll_parser.cpp @@ -355,11 +355,12 @@ Status ScrollParser::fill_tuple(const TupleDescriptor* tuple_desc, Tuple* tuple, // obj[FIELD_ID] must not be nullptr std::string _id = obj[FIELD_ID].GetString(); size_t len = _id.length(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(len)); + Status rst; + char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(len, &rst)); if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", len, "string slot"); - return tuple_pool->mem_tracker()->mem_limit_exceeded(nullptr, details, len); + RETURN_ALLOC_LIMIT_EXCEEDED(tuple_pool->mem_tracker(), nullptr, details, len, rst); } memcpy(buffer, _id.data(), len); reinterpret_cast(slot)->ptr = buffer; @@ -413,11 +414,12 @@ Status ScrollParser::fill_tuple(const TupleDescriptor* tuple_desc, Tuple* tuple, } } size_t val_size = val.length(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); + Status rst; + char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size, &rst)); if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute( ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", val_size, "string slot"); - return tuple_pool->mem_tracker()->mem_limit_exceeded(nullptr, details, val_size); + RETURN_ALLOC_LIMIT_EXCEEDED(tuple_pool->mem_tracker(), nullptr, details, val_size, rst); } memcpy(buffer, 
val.data(), val_size); reinterpret_cast(slot)->ptr = buffer; diff --git a/be/src/exec/es_http_scan_node.cpp b/be/src/exec/es_http_scan_node.cpp index 7b67486401be2f..eae73138cd68cc 100644 --- a/be/src/exec/es_http_scan_node.cpp +++ b/be/src/exec/es_http_scan_node.cpp @@ -30,6 +30,7 @@ #include "runtime/dpp_sink_internal.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "service/backend_options.h" #include "util/runtime_profile.h" @@ -67,6 +68,7 @@ Status EsHttpScanNode::init(const TPlanNode& tnode, RuntimeState* state) { Status EsHttpScanNode::prepare(RuntimeState* state) { VLOG_QUERY << "EsHttpScanNode prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _runtime_state = state; _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -113,6 +115,7 @@ Status EsHttpScanNode::build_conjuncts_list() { } Status EsHttpScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); @@ -181,6 +184,7 @@ Status EsHttpScanNode::collect_scanners_status() { } Status EsHttpScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (state->is_cancelled()) { std::unique_lock l(_batch_queue_lock); @@ -268,6 +272,7 @@ Status EsHttpScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); _scan_finished.store(true); @@ -307,8 +312,7 @@ Status EsHttpScanNode::scanner_scan(std::unique_ptr scanner, while (!scanner_eof) { // Fill one row 
batch - std::shared_ptr row_batch( - new RowBatch(row_desc(), _runtime_state->batch_size(), mem_tracker().get())); + std::shared_ptr row_batch(new RowBatch(row_desc(), _runtime_state->batch_size())); // create new tuple buffer for row_batch MemPool* tuple_pool = row_batch->tuple_data_pool(); @@ -406,6 +410,9 @@ static std::string get_host_port(const std::vector& es_hosts) { } void EsHttpScanNode::scanner_worker(int start_idx, int length, std::promise& p_status) { + SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(), + print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id(), mem_tracker()); // Clone expr context std::vector scanner_expr_ctxs; DCHECK(start_idx < length); diff --git a/be/src/exec/es_http_scanner.cpp b/be/src/exec/es_http_scanner.cpp index 9a914b90c2361a..545c4699e1ade2 100644 --- a/be/src/exec/es_http_scanner.cpp +++ b/be/src/exec/es_http_scanner.cpp @@ -24,7 +24,6 @@ #include "exprs/expr_context.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" #include "runtime/raw_value.h" #include "runtime/runtime_state.h" #include "runtime/tuple.h" @@ -43,14 +42,7 @@ EsHttpScanner::EsHttpScanner(RuntimeState* state, RuntimeProfile* profile, Tuple _next_range(0), _line_eof(false), _batch_eof(false), -#if BE_TEST - _mem_tracker(new MemTracker()), -#else - _mem_tracker( - MemTracker::create_tracker(-1, "EsHttpScanner:" + std::to_string(state->load_job_id()), - state->instance_mem_tracker())), -#endif - _mem_pool(_mem_tracker.get()), + _mem_pool("EsHttpScanner"), _tuple_desc(nullptr), _counter(counter), _es_reader(nullptr), diff --git a/be/src/exec/es_http_scanner.h b/be/src/exec/es_http_scanner.h index dcebfe164994b1..cc2380607d25f6 100644 --- a/be/src/exec/es_http_scanner.h +++ b/be/src/exec/es_http_scanner.h @@ -43,7 +43,6 @@ class TextConverter; class TupleDescriptor; class TupleRow; class RowDescriptor; -class MemTracker; class RuntimeProfile; struct EsScanCounter { @@ -82,7 +81,6 
@@ class EsHttpScanner { std::vector _slot_descs; std::unique_ptr _row_desc; - std::shared_ptr _mem_tracker; MemPool _mem_pool; const TupleDescriptor* _tuple_desc; diff --git a/be/src/exec/es_scan_node.cpp b/be/src/exec/es_scan_node.cpp index c71a3efe1d1409..a548a0b1cacf16 100644 --- a/be/src/exec/es_scan_node.cpp +++ b/be/src/exec/es_scan_node.cpp @@ -34,6 +34,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "service/backend_options.h" #include "util/debug_util.h" @@ -67,6 +68,7 @@ Status EsScanNode::prepare(RuntimeState* state) { VLOG_CRITICAL << "EsScanNode::Prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); if (_tuple_desc == nullptr) { std::stringstream ss; @@ -80,6 +82,7 @@ Status EsScanNode::prepare(RuntimeState* state) { } Status EsScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "EsScanNode::Open"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); @@ -204,6 +207,7 @@ Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // create tuple @@ -256,6 +260,7 @@ Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) Status EsScanNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "EsScanNode::Close"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -771,11 +776,12 @@ Status EsScanNode::materialize_row(MemPool* 
tuple_pool, Tuple* tuple, } const string& val = col.string_vals[val_idx]; size_t val_size = val.size(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); + Status rst; + char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size, &rst)); if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute( ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", val_size, "string slot"); - return tuple_pool->mem_tracker()->mem_limit_exceeded(nullptr, details, val_size); + RETURN_ALLOC_LIMIT_EXCEEDED(tuple_pool->mem_tracker(), nullptr, details, val_size, rst); } memcpy(buffer, val.data(), val_size); reinterpret_cast(slot)->ptr = buffer; diff --git a/be/src/exec/except_node.cpp b/be/src/exec/except_node.cpp index 8229b73e53a669..992ab60a068e6a 100644 --- a/be/src/exec/except_node.cpp +++ b/be/src/exec/except_node.cpp @@ -21,6 +21,7 @@ #include "exprs/expr.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" namespace doris { ExceptNode::ExceptNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) @@ -40,6 +41,7 @@ Status ExceptNode::init(const TPlanNode& tnode, RuntimeState* state) { Status ExceptNode::open(RuntimeState* state) { RETURN_IF_ERROR(SetOperationNode::open(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "Except , while probing the hash table."); // if a table is empty, the result must be empty if (_hash_tbl->size() == 0) { _hash_tbl_iterator = _hash_tbl->begin(); @@ -53,15 +55,13 @@ Status ExceptNode::open(RuntimeState* state) { if (i > 1) { refresh_hash_table(i); } // probe - _probe_batch.reset( - new RowBatch(child(i)->row_desc(), state->batch_size(), mem_tracker().get())); + _probe_batch.reset(new RowBatch(child(i)->row_desc(), state->batch_size())); ScopedTimer probe_timer(_probe_timer); RETURN_IF_ERROR(child(i)->open(state)); eos = false; while (!eos) { RETURN_IF_CANCELLED(state); 
RETURN_IF_ERROR(child(i)->get_next(state, _probe_batch.get(), &eos)); - RETURN_IF_LIMIT_EXCEEDED(state, " Except , while probing the hash table."); for (int j = 0; j < _probe_batch->num_rows(); ++j) { VLOG_ROW << "probe row: " << get_row_output_string(_probe_batch->get_row(j), child(i)->row_desc()); @@ -90,6 +90,7 @@ Status ExceptNode::open(RuntimeState* state) { Status ExceptNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); *eos = true; if (reached_limit()) { diff --git a/be/src/exec/exchange_node.cpp b/be/src/exec/exchange_node.cpp index 14299c65a610ae..e421415b83ce56 100644 --- a/be/src/exec/exchange_node.cpp +++ b/be/src/exec/exchange_node.cpp @@ -23,6 +23,7 @@ #include "runtime/exec_env.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -57,6 +58,7 @@ Status ExchangeNode::init(const TPlanNode& tnode, RuntimeState* state) { Status ExchangeNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _convert_row_batch_timer = ADD_TIMER(runtime_profile(), "ConvertRowBatchTime"); // TODO: figure out appropriate buffer size DCHECK_GT(_num_senders, 0); @@ -74,6 +76,7 @@ Status ExchangeNode::prepare(RuntimeState* state) { } Status ExchangeNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); if (_is_merging) { @@ -82,8 +85,7 @@ Status ExchangeNode::open(RuntimeState* state) { // create_merger() will populate its merging heap with batches from the _stream_recvr, // so it is not necessary to call fill_input_row_batch(). 
if (state->enable_exchange_node_parallel_merge()) { - RETURN_IF_ERROR(_stream_recvr->create_parallel_merger(less_than, state->batch_size(), - mem_tracker().get())); + RETURN_IF_ERROR(_stream_recvr->create_parallel_merger(less_than, state->batch_size())); } else { RETURN_IF_ERROR(_stream_recvr->create_merger(less_than)); } @@ -103,6 +105,7 @@ Status ExchangeNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_is_merging) { _sort_exec_exprs.close(state); } @@ -129,6 +132,7 @@ Status ExchangeNode::fill_input_row_batch(RuntimeState* state) { Status ExchangeNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit()) { diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index 14a24304182ac7..f65dacc5f1ae72 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -208,7 +208,6 @@ Status ExecNode::prepare(RuntimeState* state) { MemTrackerLevel::VERBOSE, _runtime_profile.get()); _expr_mem_tracker = MemTracker::create_tracker(-1, "ExecNode:Exprs:" + _runtime_profile->name(), _mem_tracker); - _expr_mem_pool.reset(new MemPool(_expr_mem_tracker.get())); if (_vconjunct_ctx_ptr) { RETURN_IF_ERROR((*_vconjunct_ctx_ptr)->prepare(state, row_desc(), expr_mem_tracker())); @@ -270,10 +269,6 @@ Status ExecNode::close(RuntimeState* state) { if (_vconjunct_ctx_ptr) (*_vconjunct_ctx_ptr)->close(state); Expr::close(_conjunct_ctxs, state); - if (expr_mem_pool() != nullptr) { - _expr_mem_pool->free_all(); - } - if (_buffer_pool_client.is_registered()) { VLOG_FILE << _id << " returning reservation " << _resource_profile.min_reservation; state->initial_reservations()->Return(&_buffer_pool_client, diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h index 
c27469d2561fdb..1644ba5165db3c 100644 --- a/be/src/exec/exec_node.h +++ b/be/src/exec/exec_node.h @@ -196,8 +196,6 @@ class ExecNode { std::shared_ptr expr_mem_tracker() const { return _expr_mem_tracker; } - MemPool* expr_mem_pool() const { return _expr_mem_pool.get(); } - // Extract node id from p->name(). static int get_node_id_from_profile(RuntimeProfile* p); @@ -306,14 +304,9 @@ class ExecNode { /// Account for peak memory used by this node std::shared_ptr _mem_tracker; - - /// MemTracker used by 'expr_mem_pool_'. + // MemTracker used by all Expr. std::shared_ptr _expr_mem_tracker; - /// MemPool for allocating data structures used by expression evaluators in this node. - /// Created in Prepare(). - std::unique_ptr _expr_mem_pool; - RuntimeProfile::Counter* _rows_returned_counter; RuntimeProfile::Counter* _rows_returned_rate; // Account for peak memory used by this node diff --git a/be/src/exec/hash_join_node.cpp b/be/src/exec/hash_join_node.cpp index 5f39ff0e5102e9..41bde1e45e0756 100644 --- a/be/src/exec/hash_join_node.cpp +++ b/be/src/exec/hash_join_node.cpp @@ -96,8 +96,9 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status HashJoinNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); - _build_pool.reset(new MemPool(mem_tracker().get())); + _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _push_down_timer = ADD_TIMER(runtime_profile(), "PushDownTime"); _push_compute_timer = ADD_TIMER(runtime_profile(), "PushDownComputeTime"); @@ -147,8 +148,7 @@ Status HashJoinNode::prepare(RuntimeState* state) { _hash_tbl.reset(new HashTable(_build_expr_ctxs, _probe_expr_ctxs, _build_tuple_size, stores_nulls, _is_null_safe_eq_join, id(), mem_tracker(), 1024)); - _probe_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _probe_batch.reset(new RowBatch(child(0)->row_desc(), 
state->batch_size())); return Status::OK(); } @@ -157,6 +157,7 @@ Status HashJoinNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); // Must reset _probe_batch in close() to release resources @@ -177,8 +178,8 @@ Status HashJoinNode::close(RuntimeState* state) { } void HashJoinNode::build_side_thread(RuntimeState* state, std::promise* status) { - SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(state->query_id()), - state->fragment_instance_id()); + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), mem_tracker()); status->set_value(construct_hash_table(state)); } @@ -187,7 +188,8 @@ Status HashJoinNode::construct_hash_table(RuntimeState* state) { // The hash join node needs to keep in memory all build tuples, including the tuple // row ptrs. The row ptrs are copied into the hash table's internal structure so they // don't need to be stored in the _build_pool. 
- RowBatch build_batch(child(1)->row_desc(), state->batch_size(), mem_tracker().get()); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB("Hash join, while constructing the hash table."); + RowBatch build_batch(child(1)->row_desc(), state->batch_size()); RETURN_IF_ERROR(child(1)->open(state)); SCOPED_TIMER(_build_timer); @@ -217,6 +219,7 @@ Status HashJoinNode::construct_hash_table(RuntimeState* state) { } Status HashJoinNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -234,7 +237,8 @@ Status HashJoinNode::open(RuntimeState* state) { // main thread std::promise thread_status; add_runtime_exec_option("Hash Table Built Asynchronously"); - std::thread(bind(&HashJoinNode::build_side_thread, this, state, &thread_status)).detach(); + std::thread(bind(&HashJoinNode::build_side_thread, this, state, &thread_status)) + .detach(); if (!_runtime_filter_descs.empty()) { RuntimeFilterSlots runtime_filter_slots(_probe_expr_ctxs, _build_expr_ctxs, @@ -304,7 +308,7 @@ Status HashJoinNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eo // In most cases, no additional memory overhead will be applied for at this stage, // but if the expression calculation in this node needs to apply for additional memory, // it may cause the memory to exceed the limit. 
- RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while execute get_next."); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "Hash join, while execute get_next."); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit()) { diff --git a/be/src/exec/hash_join_node_ir.cpp b/be/src/exec/hash_join_node_ir.cpp new file mode 100644 index 00000000000000..6dc5ab1dc01c35 --- /dev/null +++ b/be/src/exec/hash_join_node_ir.cpp @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "common/utils.h" +#include "exec/hash_join_node.h" +#include "exec/hash_table.hpp" +#include "exprs/expr_context.h" +#include "runtime/row_batch.h" +#include "runtime/runtime_state.h" +#include "runtime/tuple_row.h" + +namespace doris { + +// Functions in this file are cross compiled to IR with clang. + +// Wrapper around ExecNode's eval conjuncts with a different function name. +// This lets us distinguish between the join conjuncts vs. non-join conjuncts +// for codegen. +// Note: don't declare this static. LLVM will pick the fastcc calling convention and +// we will not be able to replace the functions with codegen'd versions. +// TODO: explicitly set the calling convention? 
+// TODO: investigate using fastcc for all codegen internal functions? +bool IR_NO_INLINE eval_other_join_conjuncts(ExprContext* const* ctxs, int num_ctxs, TupleRow* row) { + return ExecNode::eval_conjuncts(ctxs, num_ctxs, row); +} + +// CreateOutputRow, EvalOtherJoinConjuncts, and EvalConjuncts are replaced by +// codegen. +int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch, + int max_added_rows) { + // This path does not handle full outer or right outer joins + DCHECK(!_match_all_build); + + int row_idx = out_batch->add_rows(max_added_rows); + DCHECK(row_idx != RowBatch::INVALID_ROW_INDEX); + uint8_t* out_row_mem = reinterpret_cast(out_batch->get_row(row_idx)); + TupleRow* out_row = reinterpret_cast(out_row_mem); + + int rows_returned = 0; + int probe_rows = probe_batch->num_rows(); + + ExprContext* const* other_conjunct_ctxs = &_other_join_conjunct_ctxs[0]; + int num_other_conjunct_ctxs = _other_join_conjunct_ctxs.size(); + + ExprContext* const* conjunct_ctxs = &_conjunct_ctxs[0]; + int num_conjunct_ctxs = _conjunct_ctxs.size(); + + while (true) { + // Create output row for each matching build row + while (_hash_tbl_iterator.has_next()) { + TupleRow* matched_build_row = _hash_tbl_iterator.get_row(); + _hash_tbl_iterator.next(); + create_output_row(out_row, _current_probe_row, matched_build_row); + + if (!eval_other_join_conjuncts(other_conjunct_ctxs, num_other_conjunct_ctxs, out_row)) { + continue; + } + + _matched_probe = true; + + // left_anti_join: equal match won't return + if (_join_op == TJoinOp::LEFT_ANTI_JOIN) { + _hash_tbl_iterator = _hash_tbl->end(); + break; + } + + if (eval_conjuncts(conjunct_ctxs, num_conjunct_ctxs, out_row)) { + ++rows_returned; + + // Filled up out batch or hit limit + if (UNLIKELY(rows_returned == max_added_rows)) { + goto end; + } + + // Advance to next out row + out_row_mem += out_batch->row_byte_size(); + out_row = reinterpret_cast(out_row_mem); + } + + // Handle left semi-join + if 
(_match_one_build) { + _hash_tbl_iterator = _hash_tbl->end(); + break; + } + } + + // Handle left outer-join and left semi-join + if ((!_matched_probe && _match_all_probe) || + ((!_matched_probe && _join_op == TJoinOp::LEFT_ANTI_JOIN))) { + create_output_row(out_row, _current_probe_row, nullptr); + _matched_probe = true; + + if (ExecNode::eval_conjuncts(conjunct_ctxs, num_conjunct_ctxs, out_row)) { + ++rows_returned; + + if (UNLIKELY(rows_returned == max_added_rows)) { + goto end; + } + + // Advance to next out row + out_row_mem += out_batch->row_byte_size(); + out_row = reinterpret_cast(out_row_mem); + } + } + + if (!_hash_tbl_iterator.has_next()) { + // Advance to the next probe row + if (UNLIKELY(_probe_batch_pos == probe_rows)) { + goto end; + } + if (++_probe_counter % RELEASE_CONTEXT_COUNTER == 0) { + ExprContext::free_local_allocations(_probe_expr_ctxs); + ExprContext::free_local_allocations(_build_expr_ctxs); + } + _current_probe_row = probe_batch->get_row(_probe_batch_pos++); + _hash_tbl_iterator = _hash_tbl->find(_current_probe_row); + _matched_probe = false; + } + } + +end: + + if (_match_one_build && _matched_probe) { + _hash_tbl_iterator = _hash_tbl->end(); + } + + out_batch->commit_rows(rows_returned); + return rows_returned; +} + +// when the build table has too many duplicated rows, the collisions will be very serious, +// so in some cases we don't need to store duplicated values in the hash table; we can build a unique one +Status HashJoinNode::process_build_batch(RuntimeState* state, RowBatch* build_batch) { + // insert build row into our hash table + if (_build_unique) { + for (int i = 0; i < build_batch->num_rows(); ++i) { + // _hash_tbl->insert_unique(build_batch->get_row(i)); + TupleRow* tuple_row = nullptr; + if (_hash_tbl->emplace_key(build_batch->get_row(i), &tuple_row)) { + build_batch->get_row(i)->deep_copy(tuple_row, + child(1)->row_desc().tuple_descriptors(), + _build_pool.get(), false); + } + } + } else { + // take ownership of tuple data of
build_batch + _build_pool->acquire_data(build_batch->tuple_data_pool(), false); + + for (int i = 0; i < build_batch->num_rows(); ++i) { + _hash_tbl->insert(build_batch->get_row(i)); + } + } + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/exec/hash_table.cpp b/be/src/exec/hash_table.cpp index 50f9c8c87784db..b821a4a7cce9e0 100644 --- a/be/src/exec/hash_table.cpp +++ b/be/src/exec/hash_table.cpp @@ -44,12 +44,12 @@ HashTable::HashTable(const std::vector& build_expr_ctxs, _current_capacity(num_buckets), _current_used(0), _total_capacity(num_buckets), - _exceeded_limit(false), - _mem_tracker(mem_tracker) { - DCHECK(_mem_tracker); + _exceeded_limit(false) { DCHECK_EQ(_build_expr_ctxs.size(), _probe_expr_ctxs.size()); DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2"; + _mem_tracker = MemTracker::create_virtual_tracker(-1, mem_tracker->label() + ":HashTable", + mem_tracker); _buckets.resize(num_buckets); _num_buckets = num_buckets; _num_buckets_till_resize = MAX_BUCKET_OCCUPANCY_FRACTION * _num_buckets; diff --git a/be/src/exec/intersect_node.cpp b/be/src/exec/intersect_node.cpp index 60481cce861634..b943b28f85fae7 100644 --- a/be/src/exec/intersect_node.cpp +++ b/be/src/exec/intersect_node.cpp @@ -21,6 +21,7 @@ #include "exprs/expr.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" namespace doris { IntersectNode::IntersectNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) @@ -43,6 +44,7 @@ Status IntersectNode::init(const TPlanNode& tnode, RuntimeState* state) { // 2 probe with child(1), then filter the hash table and find the matched item, use them to rebuild a hash table // repeat [2] this for all the rest child Status IntersectNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "Intersect , while probing the hash table."); RETURN_IF_ERROR(SetOperationNode::open(state)); // if a table is 
empty, the result must be empty if (_hash_tbl->size() == 0) { @@ -57,14 +59,13 @@ Status IntersectNode::open(RuntimeState* state) { _valid_element_in_hash_tbl = 0; // probe _probe_batch.reset( - new RowBatch(child(i)->row_desc(), state->batch_size(), mem_tracker().get())); + new RowBatch(child(i)->row_desc(), state->batch_size())); ScopedTimer probe_timer(_probe_timer); RETURN_IF_ERROR(child(i)->open(state)); eos = false; while (!eos) { RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(child(i)->get_next(state, _probe_batch.get(), &eos)); - RETURN_IF_LIMIT_EXCEEDED(state, " Intersect , while probing the hash table."); for (int j = 0; j < _probe_batch->num_rows(); ++j) { VLOG_ROW << "probe row: " << get_row_output_string(_probe_batch->get_row(j), child(i)->row_desc()); @@ -87,6 +88,7 @@ Status IntersectNode::open(RuntimeState* state) { } Status IntersectNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/json_scanner.cpp b/be/src/exec/json_scanner.cpp index eaff43f69a7ed2..3d611e668187d9 100644 --- a/be/src/exec/json_scanner.cpp +++ b/be/src/exec/json_scanner.cpp @@ -30,7 +30,6 @@ #include "exprs/json_functions.h" #include "gutil/strings/split.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" #include "runtime/runtime_state.h" namespace doris { diff --git a/be/src/exec/json_scanner.h b/be/src/exec/json_scanner.h index 1a489a5ed890d3..91528c8351b927 100644 --- a/be/src/exec/json_scanner.h +++ b/be/src/exec/json_scanner.h @@ -47,7 +47,6 @@ class Tuple; class SlotDescriptor; class RuntimeState; class TupleDescriptor; -class MemTracker; class JsonReader; class LineReader; class FileReader; diff --git a/be/src/exec/merge_join_node.cpp b/be/src/exec/merge_join_node.cpp index d83e872507e40a..72ae19ac414822 100644 --- 
a/be/src/exec/merge_join_node.cpp +++ b/be/src/exec/merge_join_node.cpp @@ -25,6 +25,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" #include "util/runtime_profile.h" @@ -71,6 +72,7 @@ Status MergeJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status MergeJoinNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // build and probe exprs are evaluated in the context of the rows produced by our // right and left children, respectively @@ -129,10 +131,8 @@ Status MergeJoinNode::prepare(RuntimeState* state) { _right_tuple_idx.push_back(_row_descriptor.get_tuple_idx(right_tuple_desc->id())); } - _left_child_ctx.reset( - new ChildReaderContext(row_desc(), state->batch_size(), state->instance_mem_tracker())); - _right_child_ctx.reset( - new ChildReaderContext(row_desc(), state->batch_size(), state->instance_mem_tracker())); + _left_child_ctx.reset(new ChildReaderContext(row_desc(), state->batch_size())); + _right_child_ctx.reset(new ChildReaderContext(row_desc(), state->batch_size())); return Status::OK(); } @@ -141,6 +141,7 @@ Status MergeJoinNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); Expr::close(_left_expr_ctxs, state); Expr::close(_right_expr_ctxs, state); @@ -149,6 +150,7 @@ Status MergeJoinNode::close(RuntimeState* state) { } Status MergeJoinNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(ExecNode::open(state)); @@ -170,6 +172,7 @@ Status MergeJoinNode::open(RuntimeState* state) { } Status MergeJoinNode::get_next(RuntimeState* state, RowBatch* 
out_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -295,14 +298,12 @@ Status MergeJoinNode::get_input_row(RuntimeState* state, int child_idx) { } if (child_idx == 0) { - _left_child_ctx.reset(new ChildReaderContext(child(child_idx)->row_desc(), - state->batch_size(), - state->instance_mem_tracker())); + _left_child_ctx.reset( + new ChildReaderContext(child(child_idx)->row_desc(), state->batch_size())); ctx = _left_child_ctx.get(); } else { - _right_child_ctx.reset(new ChildReaderContext(child(child_idx)->row_desc(), - state->batch_size(), - state->instance_mem_tracker())); + _right_child_ctx.reset( + new ChildReaderContext(child(child_idx)->row_desc(), state->batch_size())); ctx = _right_child_ctx.get(); } diff --git a/be/src/exec/merge_join_node.h b/be/src/exec/merge_join_node.h index d8b294ea186f1c..ef02727be576fd 100644 --- a/be/src/exec/merge_join_node.h +++ b/be/src/exec/merge_join_node.h @@ -65,9 +65,8 @@ class MergeJoinNode : public ExecNode { int row_idx; bool is_eos; TupleRow* current_row; - ChildReaderContext(const RowDescriptor& desc, int batch_size, - const std::shared_ptr& mem_tracker) - : batch(desc, batch_size, mem_tracker.get()), + ChildReaderContext(const RowDescriptor& desc, int batch_size) + : batch(desc, batch_size), row_idx(0), is_eos(false), current_row(nullptr) {} diff --git a/be/src/exec/merge_node.cpp b/be/src/exec/merge_node.cpp index 513284e4ca6f78..b8232e00132eb3 100644 --- a/be/src/exec/merge_node.cpp +++ b/be/src/exec/merge_node.cpp @@ -23,6 +23,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" using std::vector; @@ -60,6 +61,7 @@ Status MergeNode::init(const TPlanNode& tnode, RuntimeState* state) { Status MergeNode::prepare(RuntimeState* state) { 
RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); DCHECK(_tuple_desc != nullptr); @@ -90,6 +92,7 @@ Status MergeNode::prepare(RuntimeState* state) { } Status MergeNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); // Prepare const expr lists. for (int i = 0; i < _const_result_expr_ctx_lists.size(); ++i) { @@ -105,6 +108,7 @@ Status MergeNode::open(RuntimeState* state) { } Status MergeNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -136,8 +140,8 @@ Status MergeNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) // Row batch was either never set or we're moving on to a different child. if (_child_row_batch.get() == nullptr) { RETURN_IF_CANCELLED(state); - _child_row_batch.reset(new RowBatch(child(_child_idx)->row_desc(), state->batch_size(), - mem_tracker().get())); + _child_row_batch.reset( + new RowBatch(child(_child_idx)->row_desc(), state->batch_size())); // Open child and fetch the first row batch. 
RETURN_IF_ERROR(child(_child_idx)->open(state)); RETURN_IF_ERROR( @@ -185,6 +189,7 @@ Status MergeNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // don't call ExecNode::close(), it always closes all children _child_row_batch.reset(nullptr); for (int i = 0; i < _const_result_expr_ctx_lists.size(); ++i) { diff --git a/be/src/exec/mysql_scan_node.cpp b/be/src/exec/mysql_scan_node.cpp index 634f47c7842613..0ea0e5df56c50b 100644 --- a/be/src/exec/mysql_scan_node.cpp +++ b/be/src/exec/mysql_scan_node.cpp @@ -23,6 +23,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -53,6 +54,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get tuple desc _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -81,7 +83,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { return Status::InternalError("new a mysql scanner failed."); } - _tuple_pool.reset(new (std::nothrow) MemPool(mem_tracker().get())); + _tuple_pool.reset(new (std::nothrow) MemPool("MysqlScanNode")); if (_tuple_pool.get() == nullptr) { return Status::InternalError("new a mem pool failed."); @@ -99,6 +101,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { } Status MysqlScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); VLOG_CRITICAL << "MysqlScanNode::Open"; @@ -146,6 +149,7 @@ Status MysqlScanNode::write_text_slot(char* value, int value_length, SlotDescrip } Status MysqlScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); 
VLOG_CRITICAL << "MysqlScanNode::GetNext"; if (nullptr == state || nullptr == row_batch || nullptr == eos) { @@ -241,6 +245,7 @@ Status MysqlScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/odbc_scan_node.cpp b/be/src/exec/odbc_scan_node.cpp index 958e22ef54a166..379e8aee8451a7 100644 --- a/be/src/exec/odbc_scan_node.cpp +++ b/be/src/exec/odbc_scan_node.cpp @@ -24,6 +24,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -55,6 +56,7 @@ Status OdbcScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get tuple desc _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -74,7 +76,7 @@ Status OdbcScanNode::prepare(RuntimeState* state) { return Status::InternalError("new a odbc scanner failed."); } - _tuple_pool.reset(new (std::nothrow) MemPool(mem_tracker().get())); + _tuple_pool.reset(new (std::nothrow) MemPool("OdbcScanNode")); if (_tuple_pool.get() == nullptr) { return Status::InternalError("new a mem pool failed."); @@ -92,6 +94,7 @@ Status OdbcScanNode::prepare(RuntimeState* state) { } Status OdbcScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); VLOG_CRITICAL << _scan_node_type << "::Open"; @@ -128,6 +131,7 @@ Status OdbcScanNode::write_text_slot(char* value, int value_length, SlotDescript Status OdbcScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { VLOG_CRITICAL << _scan_node_type << "::GetNext"; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if 
(nullptr == state || nullptr == row_batch || nullptr == eos) { return Status::InternalError("input is nullptr pointer"); @@ -232,6 +236,7 @@ Status OdbcScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index e53b54b1a47578..dceb6171d6ba77 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -56,7 +56,6 @@ OlapScanNode::OlapScanNode(ObjectPool* pool, const TPlanNode& tnode, const Descr _transfer_done(false), _status(Status::OK()), _resource_info(nullptr), - _buffered_bytes(0), _eval_conjuncts_fn(nullptr), _runtime_filter_descs(tnode.runtime_filters) {} @@ -172,6 +171,7 @@ void OlapScanNode::_init_counter(RuntimeState* state) { Status OlapScanNode::prepare(RuntimeState* state) { init_scan_profile(); RETURN_IF_ERROR(ScanNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // create scanner profile // create timer _tablet_counter = ADD_COUNTER(runtime_profile(), "TabletCount ", TUnit::UNIT); @@ -180,6 +180,9 @@ Status OlapScanNode::prepare(RuntimeState* state) { _init_counter(state); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); + scanner_mem_tracker = MemTracker::create_virtual_tracker(state->instance_mem_tracker()->limit(), + "Scanners", mem_tracker()); + if (_tuple_desc == nullptr) { // TODO: make sure we print all available diagnostic output to our error log return Status::InternalError("Failed to get tuple descriptor."); @@ -213,6 +216,7 @@ Status OlapScanNode::prepare(RuntimeState* state) { } Status OlapScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "OlapScanNode::Open"; SCOPED_TIMER(_runtime_profile->total_time_counter()); 
RETURN_IF_CANCELLED(state); @@ -256,6 +260,7 @@ Status OlapScanNode::open(RuntimeState* state) { } Status OlapScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -347,8 +352,6 @@ Status OlapScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eo << Tuple::to_string(row->get_tuple(0), *_tuple_desc); } } - __sync_fetch_and_sub(&_buffered_bytes, - row_batch->tuple_data_pool()->total_reserved_bytes()); delete materialized_batch; return Status::OK(); @@ -372,6 +375,7 @@ Status OlapScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); // change done status @@ -795,8 +799,9 @@ Status OlapScanNode::start_scan_thread(RuntimeState* state) { ++j, ++i) { scanner_ranges.push_back((*ranges)[i].get()); } - OlapScanner* scanner = new OlapScanner(state, this, _olap_scan_node.is_preaggregation, - _need_agg_finalize, *scan_range); + OlapScanner* scanner = + new OlapScanner(state, this, _olap_scan_node.is_preaggregation, + _need_agg_finalize, *scan_range, scanner_mem_tracker); // add scanner to pool before doing prepare. // so that scanner can be automatically deconstructed if prepare failed. 
_scanner_pool.add(scanner); @@ -1332,8 +1337,8 @@ Status OlapScanNode::normalize_bloom_filter_predicate(SlotDescriptor* slot) { void OlapScanNode::transfer_thread(RuntimeState* state) { // scanner open pushdown to scanThread - SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(state->query_id()), - state->fragment_instance_id()); + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), mem_tracker()); Status status = Status::OK(); for (auto scanner : _olap_scanners) { status = Expr::clone_if_not_exists(_conjunct_ctxs, state, scanner->conjunct_ctxs()); @@ -1361,13 +1366,8 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { _nice = 18 + std::max(0, 2 - (int)_olap_scanners.size() / 5); std::list olap_scanners; - int64_t mem_limit = 512 * 1024 * 1024; - // TODO(zc): use memory limit - int64_t mem_consume = __sync_fetch_and_add(&_buffered_bytes, 0); - if (state->fragment_mem_tracker() != nullptr) { - mem_limit = state->fragment_mem_tracker()->limit(); - mem_consume = state->fragment_mem_tracker()->consumption(); - } + int64_t mem_limit = scanner_mem_tracker->limit(); + int64_t mem_consume = scanner_mem_tracker->consumption(); int max_thread = _max_materialized_row_batches; if (config::doris_scanner_row_num > state->batch_size()) { max_thread /= config::doris_scanner_row_num / state->batch_size(); @@ -1386,13 +1386,9 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { { std::unique_lock l(_scan_batches_lock); assigned_thread_num = _running_thread; - // int64_t buf_bytes = __sync_fetch_and_add(&_buffered_bytes, 0); // How many thread can apply to this query size_t thread_slot_num = 0; - mem_consume = __sync_fetch_and_add(&_buffered_bytes, 0); - if (state->fragment_mem_tracker() != nullptr) { - mem_consume = state->fragment_mem_tracker()->consumption(); - } + mem_consume = scanner_mem_tracker->consumption(); if (mem_consume < (mem_limit * 6) / 10) { thread_slot_num = max_thread - 
assigned_thread_num; } else { @@ -1504,10 +1500,9 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { } void OlapScanNode::scanner_thread(OlapScanner* scanner) { - SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(scanner->runtime_state()->query_id()), - _runtime_state->fragment_instance_id()); - // thread_local_ctx.attach(ThreadContext::QUERY, print_id(scanner->runtime_state()->query_id()), - // _runtime_state->fragment_instance_id()); + SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(), + print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id(), mem_tracker()); if (UNLIKELY(_transfer_done)) { _scanner_done = true; std::unique_lock l(_scan_batches_lock); @@ -1517,7 +1512,6 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); LOG(INFO) << "Scan thread cancelled, cause query done, scan thread started to exit"; - // thread_local_ctx.detach(); return; } int64_t wait_time = scanner->update_wait_worker_timer(); @@ -1588,8 +1582,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { << ", fragment id=" << print_id(_runtime_state->fragment_instance_id()); break; } - RowBatch* row_batch = new RowBatch(this->row_desc(), state->batch_size(), - _runtime_state->fragment_mem_tracker().get()); + RowBatch* row_batch = new RowBatch(this->row_desc(), state->batch_size()); row_batch->set_scanner_id(scanner->id()); status = scanner->get_batch(_runtime_state, row_batch, &eos); if (!status.ok()) { @@ -1604,8 +1597,6 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { row_batch = nullptr; } else { row_batchs.push_back(row_batch); - __sync_fetch_and_add(&_buffered_bytes, - row_batch->tuple_data_pool()->total_reserved_bytes()); } raw_rows_read = scanner->raw_rows_read(); } @@ -1667,7 +1658,6 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { // and transfer thread _scan_batch_added_cv.notify_one(); _scan_thread_exit_cv.notify_one(); - // 
thread_local_ctx.detach(); } Status OlapScanNode::add_one_batch(RowBatch* row_batch) { @@ -1687,7 +1677,6 @@ Status OlapScanNode::add_one_batch(RowBatch* row_batch) { return Status::OK(); } - vectorized::VExpr* OlapScanNode::_dfs_peel_conjunct(vectorized::VExpr* expr, int& leaf_index) { static constexpr auto is_leaf = [](vectorized::VExpr* expr) { return !expr->is_and_expr(); }; diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h index 82e98d5c0bac01..83150c747c20f4 100644 --- a/be/src/exec/olap_scan_node.h +++ b/be/src/exec/olap_scan_node.h @@ -248,7 +248,7 @@ class OlapScanNode : public ScanNode { TResourceInfo* _resource_info; - int64_t _buffered_bytes; + std::shared_ptr scanner_mem_tracker; EvalConjunctsFn _eval_conjuncts_fn; bool _need_agg_finalize = true; diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index daa536493c9027..682886ed17f84e 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -30,6 +30,7 @@ #include "runtime/descriptors.h" #include "runtime/mem_pool.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "runtime/runtime_state.h" #include "service/backend_options.h" #include "util/doris_metrics.h" @@ -39,7 +40,8 @@ namespace doris { OlapScanner::OlapScanner(RuntimeState* runtime_state, OlapScanNode* parent, bool aggregation, - bool need_agg_finalize, const TPaloScanRange& scan_range) + bool need_agg_finalize, const TPaloScanRange& scan_range, + std::shared_ptr tracker) : _runtime_state(runtime_state), _parent(parent), _tuple_desc(parent->_tuple_desc), @@ -48,15 +50,15 @@ OlapScanner::OlapScanner(RuntimeState* runtime_state, OlapScanNode* parent, bool _aggregation(aggregation), _need_agg_finalize(need_agg_finalize), _version(-1), - _mem_tracker(MemTracker::create_tracker( - runtime_state->fragment_mem_tracker()->limit(), "OlapScanner", - runtime_state->fragment_mem_tracker(), MemTrackerLevel::VERBOSE)) {} + 
_mem_tracker(MemTracker::create_tracker(tracker->limit(), + tracker->label() + ":OlapScanner", tracker)) {} Status OlapScanner::prepare( const TPaloScanRange& scan_range, const std::vector& key_ranges, const std::vector& filters, const std::vector>>& bloom_filters) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); set_tablet_reader(); // set limit to reduce end of rowset and segment mem use _tablet_reader->set_batch_size(_parent->limit() == -1 ? _parent->_runtime_state->batch_size() : std::min( @@ -92,7 +94,7 @@ Status OlapScanner::prepare( // the rowsets maybe compacted when the last olap scanner starts Version rd_version(0, _version); OLAPStatus acquire_reader_st = - _tablet->capture_rs_readers(rd_version, &_tablet_reader_params.rs_readers, _mem_tracker); + _tablet->capture_rs_readers(rd_version, &_tablet_reader_params.rs_readers); if (acquire_reader_st != OLAP_SUCCESS) { LOG(WARNING) << "fail to init reader.res=" << acquire_reader_st; std::stringstream ss; @@ -113,6 +115,7 @@ Status OlapScanner::prepare( } Status OlapScanner::open() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_parent->_reader_init_timer); if (_conjunct_ctxs.size() > _parent->_direct_conjunct_size) { @@ -256,13 +259,14 @@ Status OlapScanner::_init_return_columns() { } Status OlapScanner::get_batch(RuntimeState* state, RowBatch* batch, bool* eof) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // 2. 
Allocate Row's Tuple buf uint8_t* tuple_buf = batch->tuple_data_pool()->allocate(state->batch_size() * _tuple_desc->byte_size()); bzero(tuple_buf, state->batch_size() * _tuple_desc->byte_size()); Tuple* tuple = reinterpret_cast(tuple_buf); - std::unique_ptr mem_pool(new MemPool(_mem_tracker.get())); + std::unique_ptr mem_pool(new MemPool()); int64_t raw_rows_threshold = raw_rows_read() + config::doris_scanner_row_num; { SCOPED_TIMER(_parent->_scan_timer); @@ -274,7 +278,7 @@ Status OlapScanner::get_batch(RuntimeState* state, RowBatch* batch, bool* eof) { } // Read one row from reader auto res = _tablet_reader->next_row_with_aggregation(&_read_row_cursor, mem_pool.get(), - batch->agg_object_pool(), eof); + batch->agg_object_pool(), eof); if (res != OLAP_SUCCESS) { std::stringstream ss; ss << "Internal Error: read storage fail. res=" << res @@ -585,6 +589,7 @@ Status OlapScanner::close(RuntimeState* state) { if (_is_closed) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // olap scan node will call scanner.close() when finished // will release resources here // if not clear rowset readers in read_params here diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h index 0c684d9851b378..a7938d323f587e 100644 --- a/be/src/exec/olap_scanner.h +++ b/be/src/exec/olap_scanner.h @@ -47,7 +47,8 @@ class OlapScanNode; class OlapScanner { public: OlapScanner(RuntimeState* runtime_state, OlapScanNode* parent, bool aggregation, - bool need_agg_finalize, const TPaloScanRange& scan_range); + bool need_agg_finalize, const TPaloScanRange& scan_range, + std::shared_ptr tracker); virtual ~OlapScanner() = default; diff --git a/be/src/exec/orc_scanner.cpp b/be/src/exec/orc_scanner.cpp index 25031c3016ba38..2ea9f934f8f4fa 100644 --- a/be/src/exec/orc_scanner.cpp +++ b/be/src/exec/orc_scanner.cpp @@ -24,13 +24,12 @@ #include "exprs/expr.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" 
#include "runtime/raw_value.h" #include "runtime/runtime_state.h" #include "runtime/tuple.h" #if defined(__x86_64__) - #include "exec/hdfs_file_reader.h" +#include "exec/hdfs_file_reader.h" #endif // orc include file didn't expose orc::TimezoneError diff --git a/be/src/exec/partitioned_aggregation_node.cc b/be/src/exec/partitioned_aggregation_node.cc index 35b502fe7ce305..4b006b1fd78436 100644 --- a/be/src/exec/partitioned_aggregation_node.cc +++ b/be/src/exec/partitioned_aggregation_node.cc @@ -41,6 +41,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" +#include "runtime/thread_context.h" #include "runtime/tuple.h" #include "runtime/tuple_row.h" #include "udf/udf_internal.h" @@ -152,8 +153,7 @@ Status PartitionedAggregationNode::init(const TPlanNode& tnode, RuntimeState* st DCHECK_EQ(intermediate_tuple_desc_->slots().size(), output_tuple_desc_->slots().size()); const RowDescriptor& row_desc = child(0)->row_desc(); - RETURN_IF_ERROR(Expr::create(tnode.agg_node.grouping_exprs, row_desc, state, &grouping_exprs_, - mem_tracker())); + RETURN_IF_ERROR(Expr::create(tnode.agg_node.grouping_exprs, row_desc, state, &grouping_exprs_)); // Construct build exprs from intermediate_row_desc_ for (int i = 0; i < grouping_exprs_.size(); ++i) { SlotDescriptor* desc = intermediate_tuple_desc_->slots()[i]; @@ -185,10 +185,11 @@ Status PartitionedAggregationNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); state_ = state; - mem_pool_.reset(new MemPool(mem_tracker().get())); - agg_fn_pool_.reset(new MemPool(expr_mem_tracker().get())); + mem_pool_.reset(new MemPool()); + agg_fn_pool_.reset(new MemPool()); ht_resize_timer_ = ADD_TIMER(runtime_profile(), "HTResizeTime"); get_results_timer_ = ADD_TIMER(runtime_profile(), "GetResultsTime"); @@ -231,20 +232,21 @@ Status 
PartitionedAggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(NewAggFnEvaluator::Create(agg_fns_, state, _pool, agg_fn_pool_.get(), &agg_fn_evals_, expr_mem_tracker(), row_desc)); - expr_results_pool_.reset(new MemPool(expr_mem_tracker().get())); + expr_results_pool_.reset(new MemPool(expr_mem_tracker())); if (!grouping_exprs_.empty()) { RowDescriptor build_row_desc(intermediate_tuple_desc_, false); RETURN_IF_ERROR(PartitionedHashTableCtx::Create( _pool, state, build_exprs_, grouping_exprs_, true, vector(build_exprs_.size(), true), state->fragment_hash_seed(), - MAX_PARTITION_DEPTH, 1, expr_mem_pool(), expr_results_pool_.get(), - expr_mem_tracker(), build_row_desc, row_desc, &ht_ctx_)); + MAX_PARTITION_DEPTH, 1, nullptr, expr_results_pool_.get(), expr_mem_tracker(), + build_row_desc, row_desc, &ht_ctx_)); } // AddCodegenDisabledMessage(state); return Status::OK(); } Status PartitionedAggregationNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // Open the child before consuming resources in this node. RETURN_IF_ERROR(child(0)->open(state)); @@ -293,7 +295,7 @@ Status PartitionedAggregationNode::open(RuntimeState* state) { // Streaming preaggregations do all processing in GetNext(). if (is_streaming_preagg_) return Status::OK(); - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch batch(child(0)->row_desc(), state->batch_size()); // Read all the rows from the child and process them. bool eos = false; do { @@ -343,6 +345,7 @@ Status PartitionedAggregationNode::open(RuntimeState* state) { } Status PartitionedAggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // 1. `!need_finalize` means this aggregation node not the level two aggregation node // 2. `grouping_exprs_.size() == 0 ` means is not group by // 3. 
`child(0)->rows_returned() == 0` mean not data from child @@ -362,7 +365,7 @@ Status PartitionedAggregationNode::get_next(RuntimeState* state, RowBatch* row_b // TODO: if ancestor node don't have a no-spilling blocking node, we could avoid a deep_copy // we should a flag indicate this node don't have to deep_copy DCHECK_EQ(row_batch->num_rows(), 0); - RowBatch batch(row_batch->row_desc(), row_batch->capacity(), _mem_tracker.get()); + RowBatch batch(row_batch->row_desc(), row_batch->capacity()); int first_row_idx = batch.num_rows(); RETURN_IF_ERROR(GetNextInternal(state, &batch, eos)); RETURN_IF_ERROR(HandleOutputStrings(&batch, first_row_idx)); @@ -403,13 +406,14 @@ Status PartitionedAggregationNode::CopyStringData(const SlotDescriptor& slot_des Tuple* tuple = batch_iter.get()->get_tuple(0); StringValue* sv = reinterpret_cast(tuple->get_slot(slot_desc.tuple_offset())); if (sv == nullptr || sv->len == 0) continue; - char* new_ptr = reinterpret_cast(pool->try_allocate(sv->len)); + Status rst; + char* new_ptr = reinterpret_cast(pool->try_allocate(sv->len, &rst)); if (UNLIKELY(new_ptr == nullptr)) { string details = Substitute( "Cannot perform aggregation at node with id $0." 
" Failed to allocate $1 output bytes.", _id, sv->len); - return pool->mem_tracker()->mem_limit_exceeded(state_, details, sv->len); + RETURN_ALLOC_LIMIT_EXCEEDED(pool->mem_tracker(), state_, details, sv->len, rst); } memcpy(new_ptr, sv->ptr, sv->len); sv->ptr = new_ptr; @@ -534,8 +538,7 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state, RowBatc DCHECK(is_streaming_preagg_); if (child_batch_ == nullptr) { - child_batch_.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + child_batch_.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); } do { @@ -686,6 +689,7 @@ Status PartitionedAggregationNode::reset(RuntimeState* state) { Status PartitionedAggregationNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (!singleton_output_tuple_returned_) { GetOutputTuple(agg_fn_evals_, singleton_output_tuple_, mem_pool_.get()); @@ -725,7 +729,7 @@ PartitionedAggregationNode::Partition::~Partition() { } Status PartitionedAggregationNode::Partition::InitStreams() { - agg_fn_pool.reset(new MemPool(parent->expr_mem_tracker().get())); + agg_fn_pool.reset(new MemPool(parent->expr_mem_tracker())); DCHECK_EQ(agg_fn_evals.size(), 0); NewAggFnEvaluator::ShallowClone(parent->partition_pool_.get(), agg_fn_pool.get(), parent->agg_fn_evals_, &agg_fn_evals); @@ -849,8 +853,7 @@ Status PartitionedAggregationNode::Partition::Spill(bool more_aggregate_rows) { // TODO(ml): enable spill std::stringstream msg; msg << "New partitioned Aggregation in spill"; - LIMIT_EXCEEDED(parent->state_->query_mem_tracker(), parent->state_, msg.str()); - // RETURN_IF_ERROR(parent->state_->StartSpilling(parent->mem_tracker())); + RETURN_LIMIT_EXCEEDED(parent->state_->query_mem_tracker(), parent->state_, msg.str()); RETURN_IF_ERROR(SerializeStreamForSpilling()); @@ -921,7 +924,8 @@ Tuple* PartitionedAggregationNode::ConstructIntermediateTuple( const int 
fixed_size = intermediate_tuple_desc_->byte_size(); const int varlen_size = GroupingExprsVarlenSize(); const int tuple_data_size = fixed_size + varlen_size; - uint8_t* tuple_data = pool->try_allocate(tuple_data_size); + Status rst; + uint8_t* tuple_data = pool->try_allocate(tuple_data_size, &rst); if (UNLIKELY(tuple_data == nullptr)) { stringstream str; str << "Memory exceed limit. Cannot perform aggregation at node with id $0. Failed " @@ -932,7 +936,7 @@ Tuple* PartitionedAggregationNode::ConstructIntermediateTuple( << ", Limit: " << pool->mem_tracker()->limit() << ". " << "You can change the limit by session variable exec_mem_limit."; string details = Substitute(str.str(), _id, tuple_data_size); - *status = pool->mem_tracker()->mem_limit_exceeded(state_, details, tuple_data_size); + *status = pool->mem_tracker()->mem_limit_exceeded(state_, details, tuple_data_size, rst); return nullptr; } memset(tuple_data, 0, fixed_size); @@ -1347,7 +1351,7 @@ Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream3* input_str bool eos = false; const RowDescriptor* desc = AGGREGATED_ROWS ? 
&intermediate_row_desc_ : &(_children[0]->row_desc()); - RowBatch batch(*desc, state_->batch_size(), mem_tracker().get()); + RowBatch batch(*desc, state_->batch_size()); do { RETURN_IF_ERROR(input_stream->GetNext(&batch, &eos)); RETURN_IF_ERROR(ProcessBatch(&batch, ht_ctx_.get())); diff --git a/be/src/exec/partitioned_hash_table.cc b/be/src/exec/partitioned_hash_table.cc index cc52c6067bb17c..11bdbfc8c70d65 100644 --- a/be/src/exec/partitioned_hash_table.cc +++ b/be/src/exec/partitioned_hash_table.cc @@ -151,7 +151,7 @@ Status PartitionedHashTableCtx::Open(RuntimeState* state) { void PartitionedHashTableCtx::Close(RuntimeState* state) { free(scratch_row_); scratch_row_ = nullptr; - expr_values_cache_.Close(tracker_); + expr_values_cache_.Close(); for (int i = 0; i < build_expr_evals_.size(); i++) { build_expr_evals_[i]->close(state); } @@ -310,13 +310,13 @@ Status PartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, MAX_EXPR_VALUES_ARRAY_SIZE / expr_values_bytes_per_row_)); int mem_usage = MemUsage(capacity_, expr_values_bytes_per_row_, num_exprs_); - Status st = tracker->try_consume(mem_usage); + Status st = tracker->check_limit(mem_usage); WARN_IF_ERROR(st, "PartitionedHashTableCtx::ExprValuesCache failed"); if (UNLIKELY(!st)) { capacity_ = 0; string details = Substitute( - "PartitionedHashTableCtx::ExprValuesCache failed to allocate $0 bytes.", mem_usage); - return tracker->mem_limit_exceeded(state, details, mem_usage); + "PartitionedHashTableCtx::ExprValuesCache failed to allocate $0 bytes", mem_usage); + RETURN_ALLOC_LIMIT_EXCEEDED(tracker, state, details, mem_usage, st); } int expr_values_size = expr_values_bytes_per_row_ * capacity_; @@ -338,7 +338,7 @@ Status PartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, return Status::OK(); } -void PartitionedHashTableCtx::ExprValuesCache::Close(const std::shared_ptr& tracker) { +void PartitionedHashTableCtx::ExprValuesCache::Close() { if (capacity_ == 0) return; cur_expr_values_ = 
nullptr; cur_expr_values_null_ = nullptr; @@ -348,8 +348,6 @@ void PartitionedHashTableCtx::ExprValuesCache::Close(const std::shared_ptrrelease(mem_usage); } int PartitionedHashTableCtx::ExprValuesCache::MemUsage(int capacity, int expr_values_bytes_per_row, diff --git a/be/src/exec/partitioned_hash_table.h b/be/src/exec/partitioned_hash_table.h index 23a9c3aaab9ee6..80007617d8ee81 100644 --- a/be/src/exec/partitioned_hash_table.h +++ b/be/src/exec/partitioned_hash_table.h @@ -211,8 +211,7 @@ class PartitionedHashTableCtx { const std::vector& build_exprs); /// Frees up various resources and updates memory tracker with proper accounting. - /// 'tracker' should be the same memory tracker which was passed in for Init(). - void Close(const std::shared_ptr& tracker); + void Close(); /// Resets the cache states (iterators, end pointers etc) before writing. void Reset() noexcept; diff --git a/be/src/exec/repeat_node.cpp b/be/src/exec/repeat_node.cpp index 78d937edd28c3f..439df40bab4cd1 100644 --- a/be/src/exec/repeat_node.cpp +++ b/be/src/exec/repeat_node.cpp @@ -22,6 +22,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -44,6 +45,7 @@ RepeatNode::~RepeatNode() {} Status RepeatNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _runtime_state = state; _tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); if (_tuple_desc == nullptr) { @@ -54,6 +56,7 @@ Status RepeatNode::prepare(RuntimeState* state) { } Status RepeatNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_CANCELLED(state); @@ -162,6 +165,7 @@ Status 
RepeatNode::get_repeated_batch(RowBatch* child_row_batch, int repeat_id_i } Status RepeatNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); DCHECK(_repeat_id_idx >= 0); @@ -175,8 +179,7 @@ Status RepeatNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) return Status::OK(); } - _child_row_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _child_row_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); RETURN_IF_ERROR(child(0)->get_next(state, _child_row_batch.get(), &_child_eos)); if (_child_row_batch->num_rows() <= 0) { @@ -203,6 +206,7 @@ Status RepeatNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _child_row_batch.reset(nullptr); RETURN_IF_ERROR(child(0)->close(state)); return ExecNode::close(state); diff --git a/be/src/exec/schema_scan_node.cpp b/be/src/exec/schema_scan_node.cpp index b393452883034e..e09d44ff7a7047 100644 --- a/be/src/exec/schema_scan_node.cpp +++ b/be/src/exec/schema_scan_node.cpp @@ -25,6 +25,7 @@ #include "gen_cpp/Types_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -99,9 +100,10 @@ Status SchemaScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // new one mem pool - _tuple_pool.reset(new (std::nothrow) MemPool(mem_tracker().get())); + _tuple_pool.reset(new (std::nothrow) MemPool()); if (nullptr == _tuple_pool.get()) { return Status::InternalError("Allocate MemPool failed."); @@ -187,6 +189,7 @@ Status SchemaScanNode::prepare(RuntimeState* state) { } Status 
SchemaScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (!_is_init) { return Status::InternalError("Open before Init."); } @@ -241,6 +244,7 @@ Status SchemaScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* } RETURN_IF_CANCELLED(state); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit()) { @@ -305,6 +309,7 @@ Status SchemaScanNode::close(RuntimeState* state) { return Status::OK(); } RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); _tuple_pool.reset(); diff --git a/be/src/exec/select_node.cpp b/be/src/exec/select_node.cpp index 25057686c49519..7648ef10b037e8 100644 --- a/be/src/exec/select_node.cpp +++ b/be/src/exec/select_node.cpp @@ -22,6 +22,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" namespace doris { @@ -33,12 +34,13 @@ SelectNode::SelectNode(ObjectPool* pool, const TPlanNode& tnode, const Descripto Status SelectNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - _child_row_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _child_row_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); return Status::OK(); } Status SelectNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(child(0)->open(state)); @@ -48,6 +50,7 @@ Status SelectNode::open(RuntimeState* state) { Status SelectNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { 
RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit() || (_child_row_idx == _child_row_batch->num_rows() && _child_eos)) { diff --git a/be/src/exec/set_operation_node.cpp b/be/src/exec/set_operation_node.cpp index 0ca6dd57c1fff6..488be80ea909e6 100644 --- a/be/src/exec/set_operation_node.cpp +++ b/be/src/exec/set_operation_node.cpp @@ -23,6 +23,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" namespace doris { SetOperationNode::SetOperationNode(ObjectPool* pool, const TPlanNode& tnode, @@ -38,9 +39,10 @@ Status SetOperationNode::init(const TPlanNode& tnode, RuntimeState* state) { Status SetOperationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); DCHECK(_tuple_desc != nullptr); - _build_pool.reset(new MemPool(mem_tracker().get())); + _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _probe_timer = ADD_TIMER(runtime_profile(), "ProbeTime"); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -68,6 +70,7 @@ Status SetOperationNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); for (auto& exprs : _child_expr_lists) { Expr::close(exprs, state); } @@ -134,6 +137,7 @@ bool SetOperationNode::equals(TupleRow* row, TupleRow* other) { Status SetOperationNode::open(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "SetOperation, while constructing the hash table."); 
SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); // open result expr lists. @@ -143,7 +147,7 @@ Status SetOperationNode::open(RuntimeState* state) { // initial build hash table used for remove duplicated _hash_tbl.reset(new HashTable(_child_expr_lists[0], _child_expr_lists[1], _build_tuple_size, true, _find_nulls, id(), mem_tracker(), 1024)); - RowBatch build_batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch build_batch(child(0)->row_desc(), state->batch_size()); RETURN_IF_ERROR(child(0)->open(state)); bool eos = false; @@ -153,7 +157,6 @@ Status SetOperationNode::open(RuntimeState* state) { RETURN_IF_ERROR(child(0)->get_next(state, &build_batch, &eos)); // take ownership of tuple data of build_batch _build_pool->acquire_data(build_batch.tuple_data_pool(), false); - RETURN_IF_LIMIT_EXCEEDED(state, " SetOperation, while constructing the hash table."); // build hash table and remove duplicate items for (int i = 0; i < build_batch.num_rows(); ++i) { VLOG_ROW << "build row: " diff --git a/be/src/exec/spill_sort_node.cc b/be/src/exec/spill_sort_node.cc index ef527a18ede4e6..4fae0c042044ef 100644 --- a/be/src/exec/spill_sort_node.cc +++ b/be/src/exec/spill_sort_node.cc @@ -21,6 +21,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/sorted_run_merger.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" namespace doris { @@ -44,6 +45,7 @@ Status SpillSortNode::init(const TPlanNode& tnode, RuntimeState* state) { Status SpillSortNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(_sort_exec_exprs.prepare(state, child(0)->row_desc(), _row_descriptor, expr_mem_tracker())); // AddExprCtxsToFree(_sort_exec_exprs); @@ -51,6 +53,7 @@ Status SpillSortNode::prepare(RuntimeState* state) { } Status 
SpillSortNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(_sort_exec_exprs.open(state)); @@ -81,6 +84,7 @@ Status SpillSortNode::open(RuntimeState* state) { } Status SpillSortNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT, state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); @@ -153,7 +157,7 @@ void SpillSortNode::debug_string(int indentation_level, stringstream* out) const } Status SpillSortNode::sort_input(RuntimeState* state) { - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch batch(child(0)->row_desc(), state->batch_size()); bool eos = false; do { batch.reset(); diff --git a/be/src/exec/table_function_node.cpp b/be/src/exec/table_function_node.cpp index 6eac8eb8243888..76a894b65a95fd 100644 --- a/be/src/exec/table_function_node.cpp +++ b/be/src/exec/table_function_node.cpp @@ -23,6 +23,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "exprs/table_function/table_function_factory.h" @@ -81,7 +82,7 @@ Status TableFunctionNode::_prepare_output_slot_ids(const TPlanNode& tnode) { Status TableFunctionNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(Expr::prepare(_fn_ctxs, state, _row_descriptor, expr_mem_tracker())); for (auto fn : _fns) { RETURN_IF_ERROR(fn->prepare()); @@ -90,6 +91,7 @@ Status TableFunctionNode::prepare(RuntimeState* state) { } Status TableFunctionNode::open(RuntimeState* state) { + 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(ExecNode::open(state)); @@ -182,6 +184,7 @@ bool TableFunctionNode::_roll_table_functions(int last_eos_idx) { // And the inner loop is to expand the row by table functions, and output row by row. Status TableFunctionNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); const RowDescriptor& parent_rowdesc = row_batch->row_desc(); @@ -203,7 +206,7 @@ Status TableFunctionNode::get_next(RuntimeState* state, RowBatch* row_batch, boo RETURN_IF_ERROR(state->check_query_state("TableFunctionNode, while getting next batch.")); if (_cur_child_batch == nullptr) { - _cur_child_batch.reset(new RowBatch(child_rowdesc, state->batch_size(), mem_tracker().get())); + _cur_child_batch.reset(new RowBatch(child_rowdesc, state->batch_size())); } if (_child_batch_exhausted) { if (_child_eos) { diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index 491fa50dad5b02..123cb521a416bf 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -18,7 +18,6 @@ #include "exec/tablet_info.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "runtime/row_batch.h" #include "runtime/tuple_row.h" #include "util/string_parser.hpp" @@ -161,8 +160,7 @@ OlapTablePartitionParam::OlapTablePartitionParam(std::shared_ptrtuple_desc()->slots()), - _mem_tracker(MemTracker::CreateTracker(-1, "OlapTablePartitionParam")) { + _mem_tracker(MemTracker::create_virtual_tracker(-1, "OlapTablePartitionParam")) { for (auto slot : _slots) { _partition_block.insert({slot->get_empty_mutable_column(), slot->get_data_type_ptr(), slot->col_name()}); } } VOlapTablePartitionParam::~VOlapTablePartitionParam() { - 
_mem_tracker->Release(_mem_usage); + _mem_tracker->release(_mem_usage); } Status VOlapTablePartitionParam::init() { @@ -509,7 +507,7 @@ Status VOlapTablePartitionParam::init() { } _mem_usage = _partition_block.allocated_bytes(); - _mem_tracker->Consume(_mem_usage); + _mem_tracker->consume(_mem_usage); return Status::OK(); } diff --git a/be/src/exec/tablet_info.h b/be/src/exec/tablet_info.h index f47566351a8c00..d51b306c533cea 100644 --- a/be/src/exec/tablet_info.h +++ b/be/src/exec/tablet_info.h @@ -36,7 +36,6 @@ namespace doris { class MemPool; -class MemTracker; class RowBatch; struct OlapTableIndexSchema { @@ -200,7 +199,6 @@ class OlapTablePartitionParam { std::vector _distributed_slot_descs; ObjectPool _obj_pool; - std::shared_ptr _mem_tracker; std::unique_ptr _mem_pool; std::vector _partitions; std::unique_ptr> diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index 8cbda78968c923..e8c285b6a9bfb6 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ -51,6 +51,7 @@ NodeChannel::NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int if (_parent->_transfer_data_by_brpc_attachment) { _tuple_data_buffer_ptr = &_tuple_data_buffer; } + _node_channel_tracker = MemTracker::create_tracker(-1, "NodeChannel"); } NodeChannel::~NodeChannel() { @@ -72,6 +73,7 @@ NodeChannel::~NodeChannel() { // no need to set _cancel_msg because the error will be // returned directly via "TabletSink::prepare()" method. 
Status NodeChannel::init(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); _tuple_desc = _parent->_output_tuple_desc; auto node = _parent->_nodes_info->find_node(_node_id); if (node == nullptr) { @@ -85,7 +87,7 @@ Status NodeChannel::init(RuntimeState* state) { _row_desc.reset(new RowDescriptor(_tuple_desc, false)); _batch_size = state->batch_size(); - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); _stub = state->exec_env()->brpc_internal_client_cache()->get_client(_node_info.host, _node_info.brpc_port); @@ -113,6 +115,7 @@ Status NodeChannel::init(RuntimeState* state) { } void NodeChannel::open() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); PTabletWriterOpenRequest request; request.set_allocated_id(&_parent->_load_id); request.set_index_id(_index_channel->_index_id); @@ -157,6 +160,7 @@ void NodeChannel::_cancel_with_msg(const std::string& msg) { } Status NodeChannel::open_wait() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); _open_closure->join(); if (_open_closure->cntl.Failed()) { if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available( @@ -233,6 +237,7 @@ Status NodeChannel::open_wait() { } Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); // If add_row() when _eos_is_produced==true, there must be sth wrong, we can only mark this channel as failed. auto st = none_of({_cancelled, _eos_is_produced}); if (!st.ok()) { @@ -249,8 +254,7 @@ Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. 
// It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). - while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && - _pending_batches_num > 0) { + while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && _pending_batches_num > 0) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); } @@ -265,7 +269,7 @@ Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { _pending_batches_num++; } - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); _cur_add_batch_request.clear_tablet_ids(); row_no = _cur_batch->add_row(); @@ -298,8 +302,7 @@ Status NodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). 
- while (!_cancelled && _parent->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && - _pending_batches_num > 0) { + while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && _pending_batches_num > 0) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); } @@ -314,7 +317,7 @@ Status NodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { _pending_batches_num++; } - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); _cur_add_batch_request.clear_tablet_ids(); row_no = _cur_batch->add_row(); @@ -330,6 +333,7 @@ Status NodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { } Status NodeChannel::mark_close() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); auto st = none_of({_cancelled, _eos_is_produced}); if (!st.ok()) { if (_cancelled) { @@ -356,6 +360,7 @@ Status NodeChannel::mark_close() { } Status NodeChannel::close_wait(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); auto st = none_of({_cancelled, !_eos_is_produced}); if (!st.ok()) { if (_cancelled) { @@ -403,6 +408,7 @@ Status NodeChannel::close_wait(RuntimeState* state) { } void NodeChannel::cancel(const std::string& cancel_msg) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); // we don't need to wait last rpc finished, cause closure's release/reset will join. // But do we need brpc::StartCancel(call_id)? 
_cancel_with_msg(cancel_msg); @@ -427,7 +433,8 @@ void NodeChannel::cancel(const std::string& cancel_msg) { request.release_id(); } -int NodeChannel::try_send_and_fetch_status(std::unique_ptr& thread_pool_token) { +int NodeChannel::try_send_and_fetch_status(RuntimeState* state, + std::unique_ptr& thread_pool_token) { auto st = none_of({_cancelled, _send_finished}); if (!st.ok()) { return 0; @@ -435,7 +442,8 @@ int NodeChannel::try_send_and_fetch_status(std::unique_ptr& thr bool is_finished = true; if (!_add_batch_closure->is_packet_in_flight() && _pending_batches_num > 0 && _last_patch_processed_finished.compare_exchange_strong(is_finished, false)) { - auto s = thread_pool_token->submit_func(std::bind(&NodeChannel::try_send_batch, this)); + auto s = thread_pool_token->submit_func( + std::bind(&NodeChannel::try_send_batch, this, state)); if (!s.ok()) { _cancel_with_msg("submit send_batch task to send_batch_thread_pool failed"); } @@ -443,7 +451,9 @@ int NodeChannel::try_send_and_fetch_status(std::unique_ptr& thr return _send_finished ? 
0 : 1; } -void NodeChannel::try_send_batch() { +void NodeChannel::try_send_batch(RuntimeState* state) { + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), _node_channel_tracker); SCOPED_ATOMIC_TIMER(&_actual_consume_ns); AddBatchReq send_batch; { @@ -531,6 +541,7 @@ Status NodeChannel::none_of(std::initializer_list vars) { } void NodeChannel::clear_all_batches() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); std::lock_guard lg(_pending_batches_lock); std::queue empty; std::swap(_pending_batches, empty); @@ -540,6 +551,7 @@ void NodeChannel::clear_all_batches() { IndexChannel::~IndexChannel() {} Status IndexChannel::init(RuntimeState* state, const std::vector& tablets) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_index_channel_tracker); for (auto& tablet : tablets) { auto location = _parent->_location->find_tablet(tablet.tablet_id); if (location == nullptr) { @@ -572,6 +584,7 @@ Status IndexChannel::init(RuntimeState* state, const std::vectorload_job_id()), - state->instance_mem_tracker()); - + state->instance_mem_tracker()); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); // Prepare the exprs to run. 
@@ -739,7 +752,7 @@ Status OlapTableSink::prepare(RuntimeState* state) { } _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, false)); - _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size(), _mem_tracker.get())); + _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size())); _max_decimalv2_val.resize(_output_tuple_desc->slots().size()); _min_decimalv2_val.resize(_output_tuple_desc->slots().size()); @@ -810,6 +823,7 @@ Status OlapTableSink::prepare(RuntimeState* state) { } Status OlapTableSink::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); SCOPED_TIMER(_open_timer); // Prepare the exprs to run. @@ -847,6 +861,7 @@ Status OlapTableSink::open(RuntimeState* state) { } Status OlapTableSink::send(RuntimeState* state, RowBatch* input_batch) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); // update incrementally so that FE can get the progress. // the real 'num_rows_load_total' will be set when sink being closed. @@ -929,6 +944,7 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { /// So here we use a flag to prevent repeated close operations. return _close_status; } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); Status status = close_status; if (status.ok()) { // only if status is ok can we call this _profile->total_time_counter(). 
@@ -1212,14 +1228,14 @@ Status OlapTableSink::_validate_data(RuntimeState* state, RowBatch* batch, Bitma void OlapTableSink::_send_batch_process(RuntimeState* state) { SCOPED_TIMER(_non_blocking_send_timer); - SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(state->query_id()), - state->fragment_instance_id()); + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), _mem_tracker); do { int running_channels_num = 0; for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&running_channels_num, this](const std::shared_ptr& ch) { + index_channel->for_each_node_channel([&running_channels_num, this, state](const std::shared_ptr& ch) { running_channels_num += - ch->try_send_and_fetch_status(this->_send_batch_thread_pool_token); + ch->try_send_and_fetch_status(state, this->_send_batch_thread_pool_token); }); } diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h index 19e80dd20d6992..32fc7db582e820 100644 --- a/be/src/exec/tablet_sink.h +++ b/be/src/exec/tablet_sink.h @@ -176,9 +176,10 @@ class NodeChannel { // 1: running, haven't reach eos. // only allow 1 rpc in flight // plz make sure, this func should be called after open_wait(). 
- int try_send_and_fetch_status(std::unique_ptr& thread_pool_token); + int try_send_and_fetch_status(RuntimeState* state, + std::unique_ptr& thread_pool_token); - void try_send_batch(); + void try_send_batch(RuntimeState* state); void time_report(std::unordered_map* add_batch_counter_map, int64_t* serialize_batch_ns, int64_t* mem_exceeded_block_ns, @@ -201,7 +202,6 @@ class NodeChannel { Status none_of(std::initializer_list vars); - // TODO(HW): remove after mem tracker shared void clear_all_batches(); std::string channel_info() const { @@ -220,6 +220,8 @@ class NodeChannel { std::string _load_info; std::string _name; + std::shared_ptr _node_channel_tracker; + TupleDescriptor* _tuple_desc = nullptr; NodeInfo _node_info; @@ -279,7 +281,9 @@ class NodeChannel { class IndexChannel { public: IndexChannel(OlapTableSink* parent, int64_t index_id, int32_t schema_hash) - : _parent(parent), _index_id(index_id), _schema_hash(schema_hash) {} + : _parent(parent), _index_id(index_id), _schema_hash(schema_hash) { + _index_channel_tracker = MemTracker::create_tracker(-1, "IndexChannel"); + } ~IndexChannel(); Status init(RuntimeState* state, const std::vector& tablets); @@ -323,6 +327,8 @@ class IndexChannel { // key is tablet_id, value is error message std::unordered_map _failed_channels_msgs; Status _intolerable_failure_status = Status::OK(); + + std::shared_ptr _index_channel_tracker; }; // Write data to Olap Table. 
diff --git a/be/src/exec/topn_node.cpp b/be/src/exec/topn_node.cpp index 7e98e1d329bfbb..3d8160ddb8f117 100644 --- a/be/src/exec/topn_node.cpp +++ b/be/src/exec/topn_node.cpp @@ -27,6 +27,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/tuple.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -59,7 +60,8 @@ Status TopNNode::init(const TPlanNode& tnode, RuntimeState* state) { Status TopNNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); - _tuple_pool.reset(new MemPool(mem_tracker().get())); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _tuple_pool.reset(new MemPool()); RETURN_IF_ERROR(_sort_exec_exprs.prepare(state, child(0)->row_desc(), _row_descriptor, expr_mem_tracker())); // AddExprCtxsToFree(_sort_exec_exprs); @@ -74,6 +76,7 @@ Status TopNNode::prepare(RuntimeState* state) { } Status TopNNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_CANCELLED(state); @@ -95,7 +98,7 @@ Status TopNNode::open(RuntimeState* state) { // Limit of 0, no need to fetch anything from children. 
if (_limit != 0) { - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch batch(child(0)->row_desc(), state->batch_size()); bool eos = false; do { @@ -126,6 +129,7 @@ Status TopNNode::open(RuntimeState* state) { } Status TopNNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); @@ -167,6 +171,7 @@ Status TopNNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_tuple_pool.get() != nullptr) { _tuple_pool->free_all(); } diff --git a/be/src/exec/union_node.cpp b/be/src/exec/union_node.cpp index cbb4bc9d5d27f0..f7f6c1d42a94e8 100644 --- a/be/src/exec/union_node.cpp +++ b/be/src/exec/union_node.cpp @@ -25,10 +25,9 @@ #include "runtime/tuple_row.h" // #include "util/runtime_profile_counters.h" #include "gen_cpp/PlanNodes_types.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" -// - namespace doris { UnionNode::UnionNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) @@ -69,6 +68,7 @@ Status UnionNode::init(const TPlanNode& tnode, RuntimeState* state) { Status UnionNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); DCHECK(_tuple_desc != nullptr); _materialize_exprs_evaluate_timer = @@ -94,6 +94,7 @@ Status UnionNode::prepare(RuntimeState* state) { } Status UnionNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); // open const expr lists. 
@@ -154,8 +155,7 @@ Status UnionNode::get_next_materialized(RuntimeState* state, RowBatch* row_batch // Child row batch was either never set or we're moving on to a different child. if (_child_batch.get() == nullptr) { DCHECK_LT(_child_idx, _children.size()); - _child_batch.reset(new RowBatch(child(_child_idx)->row_desc(), state->batch_size(), - mem_tracker().get())); + _child_batch.reset(new RowBatch(child(_child_idx)->row_desc(), state->batch_size())); _child_row_idx = 0; // open the current child unless it's the first child, which was already opened in // UnionNode::open(). @@ -233,6 +233,7 @@ Status UnionNode::get_next_const(RuntimeState* state, RowBatch* row_batch) { } Status UnionNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); @@ -280,6 +281,7 @@ Status UnionNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) Status UnionNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); _child_batch.reset(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); for (auto& exprs : _const_expr_lists) { Expr::close(exprs, state); } diff --git a/be/src/exprs/agg_fn.h b/be/src/exprs/agg_fn.h index aa15a67c89848e..684c937cce8f91 100644 --- a/be/src/exprs/agg_fn.h +++ b/be/src/exprs/agg_fn.h @@ -27,7 +27,6 @@ namespace doris { using doris_udf::FunctionContext; class MemPool; -class MemTracker; class ObjectPool; class RuntimeState; class Tuple; diff --git a/be/src/exprs/agg_fn_evaluator.cpp b/be/src/exprs/agg_fn_evaluator.cpp index f77e74be37c3eb..726575cb4d3c5c 100644 --- a/be/src/exprs/agg_fn_evaluator.cpp +++ b/be/src/exprs/agg_fn_evaluator.cpp @@ -149,7 +149,7 @@ Status AggFnEvaluator::prepare(RuntimeState* state, const RowDescriptor& desc, M _intermediate_slot_desc = intermediate_slot_desc; _string_buffer_len 
= 0; - _mem_tracker = mem_tracker; + _mem_tracker = MemTracker::create_virtual_tracker(-1, "AggFnEvaluator", mem_tracker); Status status = Expr::prepare(_input_exprs_ctxs, state, desc, _mem_tracker); RETURN_IF_ERROR(status); diff --git a/be/src/exprs/anyval_util.cpp b/be/src/exprs/anyval_util.cpp index 141fd09847ff91..ed1f6ddab3ea41 100644 --- a/be/src/exprs/anyval_util.cpp +++ b/be/src/exprs/anyval_util.cpp @@ -38,9 +38,10 @@ Status allocate_any_val(RuntimeState* state, MemPool* pool, const TypeDescriptor const std::string& mem_limit_exceeded_msg, AnyVal** result) { const int anyval_size = AnyValUtil::any_val_size(type); const int anyval_alignment = AnyValUtil::any_val_alignment(type); - *result = reinterpret_cast(pool->try_allocate_aligned(anyval_size, anyval_alignment)); + Status rst; + *result = reinterpret_cast(pool->try_allocate_aligned(anyval_size, anyval_alignment, &rst)); if (*result == nullptr) { - return pool->mem_tracker()->mem_limit_exceeded(state, mem_limit_exceeded_msg, anyval_size); + RETURN_ALLOC_LIMIT_EXCEEDED(pool->mem_tracker(), state, mem_limit_exceeded_msg, anyval_size, rst); } memset(static_cast(*result), 0, anyval_size); return Status::OK(); diff --git a/be/src/exprs/bloomfilter_predicate.h b/be/src/exprs/bloomfilter_predicate.h index 6fd16a1a9e81d2..f67dfc0c92a407 100644 --- a/be/src/exprs/bloomfilter_predicate.h +++ b/be/src/exprs/bloomfilter_predicate.h @@ -88,14 +88,15 @@ class IBloomFilterFuncBase { virtual Status assign(const char* data, int len) = 0; virtual Status get_data(char** data, int* len) = 0; - virtual MemTracker* tracker() = 0; virtual void light_copy(IBloomFilterFuncBase* other) = 0; }; template class BloomFilterFuncBase : public IBloomFilterFuncBase { public: - BloomFilterFuncBase(MemTracker* tracker) : _tracker(tracker), _inited(false) {} + BloomFilterFuncBase() : _inited(false) { + _tracker = MemTracker::create_virtual_tracker(-1, "BloomFilterFunc"); + } virtual ~BloomFilterFuncBase() { if (_tracker != nullptr) { @@ 
-148,18 +149,16 @@ class BloomFilterFuncBase : public IBloomFilterFuncBase { return Status::OK(); } - MemTracker* tracker() override { return _tracker; } - void light_copy(IBloomFilterFuncBase* bloomfilter_func) override { auto other_func = static_cast(bloomfilter_func); - _tracker = nullptr; + _tracker = nullptr; // Avoid repeated release when ~BloomFilterFuncBase _bloom_filter_alloced = other_func->_bloom_filter_alloced; _bloom_filter = other_func->_bloom_filter; _inited = other_func->_inited; } protected: - MemTracker* _tracker; + std::shared_ptr _tracker; // bloom filter size int32_t _bloom_filter_alloced; std::shared_ptr _bloom_filter; @@ -298,7 +297,7 @@ struct BloomFilterTypeTraits { template class BloomFilterFunc final : public BloomFilterFuncBase { public: - BloomFilterFunc(MemTracker* tracker) : BloomFilterFuncBase(tracker) {} + BloomFilterFunc() : BloomFilterFuncBase() {} ~BloomFilterFunc() = default; diff --git a/be/src/exprs/create_predicate_function.h b/be/src/exprs/create_predicate_function.h index 27aef88c921498..b8acae59646755 100644 --- a/be/src/exprs/create_predicate_function.h +++ b/be/src/exprs/create_predicate_function.h @@ -20,7 +20,6 @@ #include "exprs/bloomfilter_predicate.h" #include "exprs/hybrid_set.h" #include "exprs/minmax_predicate.h" -#include "runtime/mem_tracker.h" namespace doris { @@ -28,7 +27,7 @@ class MinmaxFunctionTraits { public: using BasePtr = MinMaxFuncBase*; template - static BasePtr get_function([[maybe_unused]] MemTracker* tracker) { + static BasePtr get_function() { return new (std::nothrow) MinMaxNumFunc::CppType>(); }; }; @@ -37,7 +36,7 @@ class HybridSetTraits { public: using BasePtr = HybridSetBase*; template - static BasePtr get_function([[maybe_unused]] MemTracker* tracker) { + static BasePtr get_function() { using CppType = typename PrimitiveTypeTraits::CppType; using Set = std::conditional_t, StringValueSet, HybridSet>; @@ -49,8 +48,8 @@ class BloomFilterTraits { public: using BasePtr = IBloomFilterFuncBase*; 
template - static BasePtr get_function(MemTracker* tracker) { - return new BloomFilterFunc(tracker); + static BasePtr get_function() { + return new BloomFilterFunc(); }; }; @@ -58,49 +57,48 @@ template class PredicateFunctionCreator { public: template - static typename Traits::BasePtr create(MemTracker* tracker = nullptr) { - return Traits::template get_function(tracker); + static typename Traits::BasePtr create() { + return Traits::template get_function(); } }; template -typename Traits::BasePtr create_predicate_function(PrimitiveType type, - MemTracker* tracker = nullptr) { +typename Traits::BasePtr create_predicate_function(PrimitiveType type) { using Creator = PredicateFunctionCreator; switch (type) { case TYPE_BOOLEAN: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_TINYINT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_SMALLINT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_INT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_BIGINT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_LARGEINT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_FLOAT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_DOUBLE: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_DECIMALV2: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_DATE: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_DATETIME: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_CHAR: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_VARCHAR: - return Creator::template create(tracker); + return Creator::template 
create(); case TYPE_STRING: - return Creator::template create(tracker); + return Creator::template create(); default: DCHECK(false) << "Invalid type."; @@ -117,8 +115,8 @@ inline auto create_set(PrimitiveType type) { return create_predicate_function(type); } -inline auto create_bloom_filter(MemTracker* tracker, PrimitiveType type) { - return create_predicate_function(type, tracker); +inline auto create_bloom_filter(PrimitiveType type) { + return create_predicate_function(type); } } // namespace doris \ No newline at end of file diff --git a/be/src/exprs/expr.cpp b/be/src/exprs/expr.cpp index 73f3775247f291..4c2d3520a2023c 100644 --- a/be/src/exprs/expr.cpp +++ b/be/src/exprs/expr.cpp @@ -815,15 +815,16 @@ void Expr::assign_fn_ctx_idx(int* next_fn_ctx_idx) { _fn_ctx_idx = *next_fn_ctx_idx; ++(*next_fn_ctx_idx); } - for (Expr* child : children()) child->assign_fn_ctx_idx(next_fn_ctx_idx); + for (Expr* child : children()) { + child->assign_fn_ctx_idx(next_fn_ctx_idx); + } _fn_ctx_idx_end = *next_fn_ctx_idx; } Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, - ObjectPool* pool, Expr** scalar_expr, - const std::shared_ptr& tracker) { + ObjectPool* pool, Expr** scalar_expr) { *scalar_expr = nullptr; - Expr* root; + Expr* root = nullptr; RETURN_IF_ERROR(create_expr(pool, texpr.nodes[0], &root)); RETURN_IF_ERROR(create_tree(texpr, pool, root)); // TODO pengyubing replace by Init() @@ -844,12 +845,11 @@ Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeSt } Status Expr::create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, ObjectPool* pool, std::vector* exprs, - const std::shared_ptr& tracker) { + RuntimeState* state, ObjectPool* pool, std::vector* exprs) { exprs->clear(); for (const TExpr& texpr : texprs) { - Expr* expr; - RETURN_IF_ERROR(create(texpr, row_desc, state, pool, &expr, tracker)); + Expr* expr = nullptr; + RETURN_IF_ERROR(create(texpr, row_desc, state, pool, 
&expr)); DCHECK(expr != nullptr); exprs->push_back(expr); } @@ -857,14 +857,13 @@ Status Expr::create(const std::vector& texprs, const RowDescriptor& row_d } Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, - Expr** scalar_expr, const std::shared_ptr& tracker) { - return Expr::create(texpr, row_desc, state, state->obj_pool(), scalar_expr, tracker); + Expr** scalar_expr) { + return Expr::create(texpr, row_desc, state, state->obj_pool(), scalar_expr); } Status Expr::create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, std::vector* exprs, - const std::shared_ptr& tracker) { - return Expr::create(texprs, row_desc, state, state->obj_pool(), exprs, tracker); + RuntimeState* state, std::vector* exprs) { + return Expr::create(texprs, row_desc, state, state->obj_pool(), exprs); } Status Expr::create_tree(const TExpr& texpr, ObjectPool* pool, Expr* root) { diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index 795dca6bc0884c..0004e9554fc276 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -179,23 +179,21 @@ class Expr { /// tuple row descriptor of the input tuple row. On failure, 'expr' is set to nullptr and /// the expr tree (if created) will be closed. Error status will be returned too. static Status create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, - ObjectPool* pool, Expr** expr, const std::shared_ptr& tracker); + ObjectPool* pool, Expr** expr); /// Create a new ScalarExpr based on thrift Expr 'texpr'. The newly created ScalarExpr /// is stored in ObjectPool 'state->obj_pool()' and returned in 'expr'. 'row_desc' is /// the tuple row descriptor of the input tuple row. Returns error status on failure. static Status create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, - Expr** expr, const std::shared_ptr& tracker); + Expr** expr); /// Convenience functions creating multiple ScalarExpr. 
static Status create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, ObjectPool* pool, std::vector* exprs, - const std::shared_ptr& tracker); + RuntimeState* state, ObjectPool* pool, std::vector* exprs); /// Convenience functions creating multiple ScalarExpr. static Status create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, std::vector* exprs, - const std::shared_ptr& tracker); + RuntimeState* state, std::vector* exprs); /// Convenience function for preparing multiple expr trees. /// Allocations from 'ctxs' will be counted against 'tracker'. diff --git a/be/src/exprs/expr_context.cpp b/be/src/exprs/expr_context.cpp index e0f3b6461b030a..d97a09a433216e 100644 --- a/be/src/exprs/expr_context.cpp +++ b/be/src/exprs/expr_context.cpp @@ -28,6 +28,7 @@ #include "runtime/mem_tracker.h" #include "runtime/raw_value.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "udf/udf_internal.h" #include "util/debug_util.h" #include "util/stack_util.h" @@ -49,15 +50,17 @@ ExprContext::~ExprContext() { } } -// TODO(zc): memory tracker Status ExprContext::prepare(RuntimeState* state, const RowDescriptor& row_desc, const std::shared_ptr& tracker) { DCHECK(tracker != nullptr) << std::endl << get_stack_trace(); + if (_prepared) { + return Status::OK(); + } + _mem_tracker = tracker; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(_pool.get() == nullptr); _prepared = true; - // TODO: use param tracker to replace instance_mem_tracker, be careful about tracker's life cycle - // _pool.reset(new MemPool(new MemTracker(-1))); - _pool.reset(new MemPool(state->instance_mem_tracker().get())); + _pool.reset(new MemPool()); return _root->prepare(state, row_desc, this); } @@ -66,6 +69,7 @@ Status ExprContext::open(RuntimeState* state) { if (_opened) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); _opened = true; // Fragment-local state is only 
initialized for original contexts. Clones inherit the // original's fragment state and only need to have thread-local state initialized. @@ -84,6 +88,7 @@ Status ExprContext::open(std::vector evals, RuntimeState* state) { void ExprContext::close(RuntimeState* state) { DCHECK(!_closed); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); FunctionContext::FunctionStateScope scope = _is_clone ? FunctionContext::THREAD_LOCAL : FunctionContext::FRAGMENT_LOCAL; _root->close(state, this, scope); @@ -112,9 +117,10 @@ Status ExprContext::clone(RuntimeState* state, ExprContext** new_ctx) { DCHECK(_prepared); DCHECK(_opened); DCHECK(*new_ctx == nullptr); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); *new_ctx = state->obj_pool()->add(new ExprContext(_root)); - (*new_ctx)->_pool.reset(new MemPool(_pool->mem_tracker())); + (*new_ctx)->_pool.reset(new MemPool()); for (int i = 0; i < _fn_contexts.size(); ++i) { (*new_ctx)->_fn_contexts.push_back(_fn_contexts[i]->impl()->clone((*new_ctx)->_pool.get())); } @@ -123,6 +129,7 @@ Status ExprContext::clone(RuntimeState* state, ExprContext** new_ctx) { (*new_ctx)->_is_clone = true; (*new_ctx)->_prepared = true; (*new_ctx)->_opened = true; + (*new_ctx)->_mem_tracker = _mem_tracker; return _root->open(state, *new_ctx, FunctionContext::THREAD_LOCAL); } @@ -132,8 +139,9 @@ Status ExprContext::clone(RuntimeState* state, ExprContext** new_ctx, Expr* root DCHECK(_opened); DCHECK(*new_ctx == nullptr); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); *new_ctx = state->obj_pool()->add(new ExprContext(root)); - (*new_ctx)->_pool.reset(new MemPool(_pool->mem_tracker())); + (*new_ctx)->_pool.reset(new MemPool()); for (int i = 0; i < _fn_contexts.size(); ++i) { (*new_ctx)->_fn_contexts.push_back(_fn_contexts[i]->impl()->clone((*new_ctx)->_pool.get())); } @@ -142,11 +150,13 @@ Status ExprContext::clone(RuntimeState* state, ExprContext** new_ctx, Expr* root (*new_ctx)->_is_clone = true; (*new_ctx)->_prepared = true; 
(*new_ctx)->_opened = true; + (*new_ctx)->_mem_tracker = _mem_tracker; return root->open(state, *new_ctx, FunctionContext::THREAD_LOCAL); } void ExprContext::free_local_allocations() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); free_local_allocations(_fn_contexts); } @@ -371,10 +381,11 @@ Status ExprContext::get_const_value(RuntimeState* state, Expr& expr, AnyVal** co StringVal* sv = reinterpret_cast(*const_val); if (!sv->is_null && sv->len > 0) { // Make sure the memory is owned by this evaluator. - char* ptr_copy = reinterpret_cast(_pool->try_allocate(sv->len)); + Status rst; + char* ptr_copy = reinterpret_cast(_pool->try_allocate(sv->len, &rst)); if (ptr_copy == nullptr) { - return _pool->mem_tracker()->mem_limit_exceeded( - state, "Could not allocate constant string value", sv->len); + RETURN_ALLOC_LIMIT_EXCEEDED(_pool->mem_tracker(), state, + "Could not allocate constant string value", sv->len, rst); } memcpy(ptr_copy, sv->ptr, sv->len); sv->ptr = reinterpret_cast(ptr_copy); diff --git a/be/src/exprs/expr_context.h b/be/src/exprs/expr_context.h index f176240f720f2b..9de41f169c7688 100644 --- a/be/src/exprs/expr_context.h +++ b/be/src/exprs/expr_context.h @@ -170,6 +170,8 @@ class ExprContext { /// TODO: revisit this FunctionContext** _fn_contexts_ptr; + std::shared_ptr _mem_tracker; + /// Pool backing fn_contexts_. Counts against the runtime state's UDF mem tracker. 
std::unique_ptr _pool; diff --git a/be/src/exprs/new_agg_fn_evaluator.cc b/be/src/exprs/new_agg_fn_evaluator.cc index 376643516cccc9..17f8a931f31037 100644 --- a/be/src/exprs/new_agg_fn_evaluator.cc +++ b/be/src/exprs/new_agg_fn_evaluator.cc @@ -90,19 +90,13 @@ typedef AnyVal (*FinalizeFn)(FunctionContext*, const AnyVal&); const int DEFAULT_MULTI_DISTINCT_COUNT_STRING_BUFFER_SIZE = 1024; -NewAggFnEvaluator::NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, - const std::shared_ptr& tracker, bool is_clone) - : _total_mem_consumption(0), - _accumulated_mem_consumption(0), +NewAggFnEvaluator::NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, bool is_clone) + : _accumulated_mem_consumption(0), is_clone_(is_clone), agg_fn_(agg_fn), - mem_pool_(mem_pool), - _mem_tracker(tracker) {} + mem_pool_(mem_pool) {} NewAggFnEvaluator::~NewAggFnEvaluator() { - if (UNLIKELY(_total_mem_consumption > 0)) { - _mem_tracker->release(_total_mem_consumption); - } DCHECK(closed_); } @@ -122,7 +116,7 @@ Status NewAggFnEvaluator::Create(const AggFn& agg_fn, RuntimeState* state, Objec // Create a new AggFn evaluator. 
NewAggFnEvaluator* agg_fn_eval = - pool->add(new NewAggFnEvaluator(agg_fn, mem_pool, tracker, false)); + pool->add(new NewAggFnEvaluator(agg_fn, mem_pool, false)); agg_fn_eval->agg_fn_ctx_.reset(FunctionContextImpl::create_context( state, mem_pool, agg_fn.GetIntermediateTypeDesc(), agg_fn.GetOutputTypeDesc(), @@ -633,7 +627,7 @@ void NewAggFnEvaluator::SerializeOrFinalize(Tuple* src, const SlotDescriptor& ds void NewAggFnEvaluator::ShallowClone(ObjectPool* pool, MemPool* mem_pool, NewAggFnEvaluator** cloned_eval) const { DCHECK(opened_); - *cloned_eval = pool->add(new NewAggFnEvaluator(agg_fn_, mem_pool, _mem_tracker, true)); + *cloned_eval = pool->add(new NewAggFnEvaluator(agg_fn_, mem_pool, true)); (*cloned_eval)->agg_fn_ctx_.reset(agg_fn_ctx_->impl()->clone(mem_pool)); DCHECK_EQ((*cloned_eval)->input_evals_.size(), 0); (*cloned_eval)->input_evals_ = input_evals_; diff --git a/be/src/exprs/new_agg_fn_evaluator.h b/be/src/exprs/new_agg_fn_evaluator.h index 36bdc2f21c4dbc..462c4705a174c5 100644 --- a/be/src/exprs/new_agg_fn_evaluator.h +++ b/be/src/exprs/new_agg_fn_evaluator.h @@ -188,7 +188,6 @@ class NewAggFnEvaluator { static std::string DebugString(const std::vector& evals); private: - uint64_t _total_mem_consumption; uint64_t _accumulated_mem_consumption; // index if has multi count distinct @@ -209,8 +208,6 @@ class NewAggFnEvaluator { /// Owned by the exec node which owns this evaluator. MemPool* mem_pool_ = nullptr; - std::shared_ptr _mem_tracker; // saved c'tor param - /// This contains runtime state such as constant input arguments to the aggregate /// functions and a FreePool from which the intermediate values are allocated. /// Owned by this evaluator. @@ -231,8 +228,7 @@ class NewAggFnEvaluator { doris_udf::AnyVal* staging_merge_input_val_ = nullptr; /// Use Create() instead. 
- NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, - const std::shared_ptr& tracker, bool is_clone); + NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, bool is_clone); /// Return the intermediate type of the aggregate function. inline const SlotDescriptor& intermediate_slot_desc() const; diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 4b603ea4ed1369..6e134578673371 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -318,18 +318,17 @@ BinaryPredicate* create_bin_predicate(ObjectPool* pool, PrimitiveType prim_type, // This class is a wrapper of runtime predicate function class RuntimePredicateWrapper { public: - RuntimePredicateWrapper(RuntimeState* state, MemTracker* tracker, ObjectPool* pool, + RuntimePredicateWrapper(RuntimeState* state, ObjectPool* pool, const RuntimeFilterParams* params) - : _tracker(tracker), - _pool(pool), + : _pool(pool), _column_return_type(params->column_return_type), _filter_type(params->filter_type), _fragment_instance_id(params->fragment_instance_id), _filter_id(params->filter_id) {} // for a 'tmp' runtime predicate wrapper // only could called assign method or as a param for merge - RuntimePredicateWrapper(MemTracker* tracker, ObjectPool* pool, RuntimeFilterType type, UniqueId fragment_instance_id, uint32_t filter_id) - : _tracker(tracker), _pool(pool), _filter_type(type), _fragment_instance_id(fragment_instance_id), _filter_id(filter_id) {} + RuntimePredicateWrapper(ObjectPool* pool, RuntimeFilterType type, UniqueId fragment_instance_id, uint32_t filter_id) + : _pool(pool), _filter_type(type), _fragment_instance_id(fragment_instance_id), _filter_id(filter_id) {} // init runtime filter wrapper // alloc memory to init runtime filter function Status init(const RuntimeFilterParams* params) { @@ -345,12 +344,12 @@ class RuntimePredicateWrapper { } case RuntimeFilterType::BLOOM_FILTER: { _is_bloomfilter = true; - 
_bloomfilter_func.reset(create_bloom_filter(_tracker, _column_return_type)); + _bloomfilter_func.reset(create_bloom_filter(_column_return_type)); return _bloomfilter_func->init_with_fixed_length(params->bloom_filter_size); } case RuntimeFilterType::IN_OR_BLOOM_FILTER: { _hybrid_set.reset(create_set(_column_return_type)); - _bloomfilter_func.reset(create_bloom_filter(_tracker, _column_return_type)); + _bloomfilter_func.reset(create_bloom_filter(_column_return_type)); return _bloomfilter_func->init_with_fixed_length(params->bloom_filter_size); } default: @@ -622,8 +621,6 @@ class RuntimePredicateWrapper { } Status assign(const PInFilter* in_filter) { - DCHECK(_tracker != nullptr); - PrimitiveType type = to_primitive_type(in_filter->column_type()); if (in_filter->has_ignored_msg()) { VLOG_DEBUG << "Ignore in filter(id=" << _filter_id << ") because: " << in_filter->ignored_msg(); @@ -726,18 +723,16 @@ class RuntimePredicateWrapper { // used by shuffle runtime filter // assign this filter by protobuf Status assign(const PBloomFilter* bloom_filter, const char* data) { - DCHECK(_tracker != nullptr); _is_bloomfilter = true; // we won't use this class to insert or find any data // so any type is ok - _bloomfilter_func.reset(create_bloom_filter(_tracker, PrimitiveType::TYPE_INT)); + _bloomfilter_func.reset(create_bloom_filter(PrimitiveType::TYPE_INT)); return _bloomfilter_func->assign(data, bloom_filter->filter_length()); } // used by shuffle runtime filter // assign this filter by protobuf Status assign(const PMinMaxFilter* minmax_filter) { - DCHECK(_tracker != nullptr); PrimitiveType type = to_primitive_type(minmax_filter->column_type()); _minmax_func.reset(create_minmax_filter(type)); switch (type) { @@ -890,7 +885,6 @@ class RuntimePredicateWrapper { } private: - MemTracker* _tracker; ObjectPool* _pool; PrimitiveType _column_return_type; // column type RuntimeFilterType _filter_type; @@ -905,10 +899,10 @@ class RuntimePredicateWrapper { uint32_t _filter_id; }; -Status 
IRuntimeFilter::create(RuntimeState* state, MemTracker* tracker, ObjectPool* pool, +Status IRuntimeFilter::create(RuntimeState* state, ObjectPool* pool, const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, const RuntimeFilterRole role, int node_id, IRuntimeFilter** res) { - *res = pool->add(new IRuntimeFilter(state, tracker, pool)); + *res = pool->add(new IRuntimeFilter(state, pool)); (*res)->set_role(role); UniqueId fragment_instance_id(state->fragment_instance_id()); return (*res)->init_with_desc(desc, query_options, fragment_instance_id, node_id); @@ -1048,7 +1042,7 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue RETURN_IF_ERROR(Expr::create_expr_tree(_pool, iter->second, &_probe_ctx)); } - _wrapper = _pool->add(new RuntimePredicateWrapper(_state, _mem_tracker, _pool, ¶ms)); + _wrapper = _pool->add(new RuntimePredicateWrapper(_state, _pool, ¶ms)); return _wrapper->init(¶ms); } @@ -1060,16 +1054,14 @@ Status IRuntimeFilter::serialize(PPublishFilterRequest* request, void** data, in return serialize_impl(request, data, len); } -Status IRuntimeFilter::create_wrapper(const MergeRuntimeFilterParams* param, MemTracker* tracker, - ObjectPool* pool, +Status IRuntimeFilter::create_wrapper(const MergeRuntimeFilterParams* param, ObjectPool* pool, std::unique_ptr* wrapper) { - return _create_wrapper(param, tracker, pool, wrapper); + return _create_wrapper(param, pool, wrapper); } -Status IRuntimeFilter::create_wrapper(const UpdateRuntimeFilterParams* param, MemTracker* tracker, - ObjectPool* pool, +Status IRuntimeFilter::create_wrapper(const UpdateRuntimeFilterParams* param, ObjectPool* pool, std::unique_ptr* wrapper) { - return _create_wrapper(param, tracker, pool, wrapper); + return _create_wrapper(param, pool, wrapper); } void IRuntimeFilter::change_to_bloom_filter() { @@ -1081,10 +1073,10 @@ void IRuntimeFilter::change_to_bloom_filter() { } template -Status IRuntimeFilter::_create_wrapper(const T* param, MemTracker* 
tracker, ObjectPool* pool, +Status IRuntimeFilter::_create_wrapper(const T* param, ObjectPool* pool, std::unique_ptr* wrapper) { int filter_type = param->request->filter_type(); - wrapper->reset(new RuntimePredicateWrapper(tracker, pool, get_type(filter_type), + wrapper->reset(new RuntimePredicateWrapper(pool, get_type(filter_type), UniqueId(param->request->fragment_id()), param->request->filter_id())); switch (filter_type) { @@ -1383,7 +1375,7 @@ Status IRuntimeFilter::update_filter(const UpdateRuntimeFilterParams* param) { set_ignored_msg(*msg); } std::unique_ptr wrapper; - RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(param, _mem_tracker, _pool, &wrapper)); + RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(param, _pool, &wrapper)); auto origin_type = _wrapper->get_real_type(); RETURN_IF_ERROR(_wrapper->merge(wrapper.get())); if (origin_type != _wrapper->get_real_type()) { diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 663843d2b68041..8d5b433b9b38ef 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -106,9 +106,8 @@ struct MergeRuntimeFilterParams { /// that can be pushed down to node based on the results of the right table. 
class IRuntimeFilter { public: - IRuntimeFilter(RuntimeState* state, MemTracker* mem_tracker, ObjectPool* pool) + IRuntimeFilter(RuntimeState* state, ObjectPool* pool) : _state(state), - _mem_tracker(mem_tracker), _pool(pool), _runtime_filter_type(RuntimeFilterType::UNKNOWN_FILTER), _filter_id(-1), @@ -124,7 +123,7 @@ class IRuntimeFilter { ~IRuntimeFilter() = default; - static Status create(RuntimeState* state, MemTracker* tracker, ObjectPool* pool, + static Status create(RuntimeState* state, ObjectPool* pool, const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, const RuntimeFilterRole role, int node_id, IRuntimeFilter** res); @@ -191,11 +190,9 @@ class IRuntimeFilter { // for ut const RuntimePredicateWrapper* get_wrapper(); - static Status create_wrapper(const MergeRuntimeFilterParams* param, MemTracker* tracker, - ObjectPool* pool, + static Status create_wrapper(const MergeRuntimeFilterParams* param, ObjectPool* pool, std::unique_ptr* wrapper); - static Status create_wrapper(const UpdateRuntimeFilterParams* param, MemTracker* tracker, - ObjectPool* pool, + static Status create_wrapper(const UpdateRuntimeFilterParams* param, ObjectPool* pool, std::unique_ptr* wrapper); void change_to_bloom_filter(); Status update_filter(const UpdateRuntimeFilterParams* param); @@ -234,11 +231,10 @@ class IRuntimeFilter { Status serialize_impl(T* request, void** data, int* len); template - static Status _create_wrapper(const T* param, MemTracker* tracker, ObjectPool* pool, + static Status _create_wrapper(const T* param, ObjectPool* pool, std::unique_ptr* wrapper); RuntimeState* _state; - MemTracker* _mem_tracker; ObjectPool* _pool; // _wrapper is a runtime filter function wrapper // _wrapper should alloc from _pool diff --git a/be/src/http/action/compaction_action.cpp b/be/src/http/action/compaction_action.cpp index 6c52c9165d8b81..b228a2b58d968d 100644 --- a/be/src/http/action/compaction_action.cpp +++ b/be/src/http/action/compaction_action.cpp @@ -30,6 +30,7 @@ 
#include "http/http_response.h" #include "http/http_status.h" #include "olap/base_compaction.h" +#include "runtime/thread_context.h" #include "olap/cumulative_compaction.h" #include "olap/olap_define.h" #include "olap/storage_engine.h" @@ -225,8 +226,7 @@ OLAPStatus CompactionAction::_execute_compaction_callback(TabletSharedPtr tablet OLAPStatus status = OLAP_SUCCESS; if (compaction_type == PARAM_COMPACTION_BASE) { - std::string tracker_label = "CompactionAction:BaseCompaction:" + std::to_string(syscall(__NR_gettid)); - BaseCompaction base_compaction(tablet, tracker_label, _compaction_mem_tracker); + BaseCompaction base_compaction(tablet); OLAPStatus res = base_compaction.compact(); if (res != OLAP_SUCCESS && res != OLAP_ERR_BE_NO_SUITABLE_VERSION) { DorisMetrics::instance()->base_compaction_request_failed->increment(1); @@ -235,8 +235,7 @@ OLAPStatus CompactionAction::_execute_compaction_callback(TabletSharedPtr tablet } status = res; } else if (compaction_type == PARAM_COMPACTION_CUMULATIVE) { - std::string tracker_label = "CompactionAction:CumulativeCompaction:" + std::to_string(syscall(__NR_gettid)); - CumulativeCompaction cumulative_compaction(tablet, tracker_label, _compaction_mem_tracker); + CumulativeCompaction cumulative_compaction(tablet); OLAPStatus res = cumulative_compaction.compact(); if (res != OLAP_SUCCESS && res != OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSIONS) { DorisMetrics::instance()->cumulative_compaction_request_failed->increment(1); @@ -254,6 +253,7 @@ OLAPStatus CompactionAction::_execute_compaction_callback(TabletSharedPtr tablet } void CompactionAction::handle(HttpRequest* req) { + SCOPED_ATTACH_TASK_THREAD_2ARG(ThreadContext::TaskType::COMPACTION, _compaction_mem_tracker); req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str()); if (_type == CompactionActionType::SHOW_INFO) { diff --git a/be/src/olap/aggregate_func.h b/be/src/olap/aggregate_func.h index e282a6e1caa0a9..1a84806952c995 100644 --- a/be/src/olap/aggregate_func.h 
+++ b/be/src/olap/aggregate_func.h @@ -24,7 +24,6 @@ #include "runtime/datetime_value.h" #include "runtime/decimalv2_value.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "runtime/string_value.h" #include "util/bitmap_value.h" @@ -488,8 +487,6 @@ struct AggregateFuncTraitsdata = reinterpret_cast(hll); - mem_pool->mem_tracker()->consume(hll->memory_consumed()); - agg_pool->add(hll); } @@ -534,7 +531,6 @@ struct AggregateFuncTraitssize = 0; auto bitmap = new BitmapValue(src_slice->data); - mem_pool->mem_tracker()->consume(sizeof(BitmapValue)); dst_slice->data = (char*)bitmap; agg_pool->add(bitmap); diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp index cc7c358d4739ac..7472a807279e28 100644 --- a/be/src/olap/base_compaction.cpp +++ b/be/src/olap/base_compaction.cpp @@ -19,16 +19,17 @@ #include "util/doris_metrics.h" #include "util/trace.h" +#include "runtime/thread_context.h" namespace doris { -BaseCompaction::BaseCompaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker) - : Compaction(tablet, label, parent_tracker) {} +BaseCompaction::BaseCompaction(TabletSharedPtr tablet) + : Compaction(tablet, "BaseCompaction:" + std::to_string(tablet->tablet_id())) {} BaseCompaction::~BaseCompaction() {} OLAPStatus BaseCompaction::prepare_compact() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (!_tablet->init_succeeded()) { return OLAP_ERR_INPUT_PARAMETER_ERROR; } @@ -50,6 +51,7 @@ OLAPStatus BaseCompaction::prepare_compact() { } OLAPStatus BaseCompaction::execute_compact_impl() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); MutexLock lock(_tablet->get_base_lock(), TRY_LOCK); if (!lock.own_lock()) { LOG(WARNING) << "another base compaction is running. 
tablet=" << _tablet->full_name(); @@ -81,6 +83,7 @@ OLAPStatus BaseCompaction::execute_compact_impl() { } OLAPStatus BaseCompaction::pick_rowsets_to_compact() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); _input_rowsets.clear(); _tablet->pick_candidate_rowsets_to_base_compaction(&_input_rowsets); if (_input_rowsets.size() <= 1) { diff --git a/be/src/olap/base_compaction.h b/be/src/olap/base_compaction.h index 54088ea48d7dfe..d4c2c2f360af79 100644 --- a/be/src/olap/base_compaction.h +++ b/be/src/olap/base_compaction.h @@ -29,8 +29,7 @@ namespace doris { class BaseCompaction : public Compaction { public: - BaseCompaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker); + BaseCompaction(TabletSharedPtr tablet); ~BaseCompaction() override; OLAPStatus prepare_compact() override; diff --git a/be/src/olap/bloom_filter_predicate.cpp b/be/src/olap/bloom_filter_predicate.cpp index 48127357181cff..834df7198994cf 100644 --- a/be/src/olap/bloom_filter_predicate.cpp +++ b/be/src/olap/bloom_filter_predicate.cpp @@ -41,19 +41,19 @@ ColumnPredicate* BloomFilterColumnPredicateFactory::create_column_predicate( switch (type) { #define M(NAME) \ case OLAP_FIELD_##NAME: { \ - filter.reset(create_bloom_filter(bloom_filter->tracker(), NAME)); \ + filter.reset(create_bloom_filter(NAME)); \ filter->light_copy(bloom_filter.get()); \ return new BloomFilterColumnPredicate(column_id, filter); \ } APPLY_FOR_PRIMTYPE(M) #undef M case OLAP_FIELD_TYPE_DECIMAL: { - filter.reset(create_bloom_filter(bloom_filter->tracker(), TYPE_DECIMALV2)); + filter.reset(create_bloom_filter(TYPE_DECIMALV2)); filter->light_copy(bloom_filter.get()); return new BloomFilterColumnPredicate(column_id, filter); } case OLAP_FIELD_TYPE_BOOL: { - filter.reset(create_bloom_filter(bloom_filter->tracker(), TYPE_BOOLEAN)); + filter.reset(create_bloom_filter(TYPE_BOOLEAN)); filter->light_copy(bloom_filter.get()); return new BloomFilterColumnPredicate(column_id, filter); } 
diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 31e2dd646e70ba..8aff5b6dded6d5 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -19,6 +19,7 @@ #include "gutil/strings/substitute.h" #include "olap/rowset/rowset_factory.h" +#include "runtime/thread_context.h" #include "util/time.h" #include "util/trace.h" @@ -26,10 +27,9 @@ using std::vector; namespace doris { -Compaction::Compaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker) +Compaction::Compaction(TabletSharedPtr tablet, const std::string& label) : _mem_tracker( - MemTracker::create_tracker(-1, label, parent_tracker, MemTrackerLevel::TASK)), + MemTracker::create_tracker(-1, label, nullptr, MemTrackerLevel::TASK)), _readers_tracker(MemTracker::create_tracker( -1, "CompactionReaderTracker:" + std::to_string(tablet->tablet_id()), _mem_tracker)), @@ -167,7 +167,6 @@ OLAPStatus Compaction::construct_output_rowset_writer() { context.rowset_state = VISIBLE; context.version = _output_version; context.segments_overlap = NONOVERLAPPING; - context.parent_mem_tracker = _writer_tracker; // The test results show that one rs writer is low-memory-footprint, there is no need to tracker its mem pool RETURN_NOT_OK(RowsetFactory::create_rowset_writer(context, &_output_rs_writer)); return OLAP_SUCCESS; @@ -176,11 +175,7 @@ OLAPStatus Compaction::construct_output_rowset_writer() { OLAPStatus Compaction::construct_input_rowset_readers() { for (auto& rowset : _input_rowsets) { RowsetReaderSharedPtr rs_reader; - RETURN_NOT_OK(rowset->create_reader( - MemTracker::create_tracker( - -1, "Compaction:RowsetReader:" + rowset->rowset_id().to_string(), - _readers_tracker), - &rs_reader)); + RETURN_NOT_OK(rowset->create_reader(&rs_reader)); _input_rs_readers.push_back(std::move(rs_reader)); } return OLAP_SUCCESS; diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index 6e7985a4d1c7c6..71dab8b3ff955e 100644 --- 
a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -44,8 +44,7 @@ class Merger; // 4. gc output rowset if failed class Compaction { public: - Compaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker); + Compaction(TabletSharedPtr tablet, const std::string& label); virtual ~Compaction(); // This is only for http CompactionAction @@ -84,6 +83,7 @@ class Compaction { // the root tracker for this compaction std::shared_ptr _mem_tracker; + // TODO(zxy) no used // the child of root, only track rowset readers mem std::shared_ptr _readers_tracker; diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index f987acdbe342ce..7990d75c95f0ff 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -19,17 +19,18 @@ #include "util/doris_metrics.h" #include "util/time.h" +#include "runtime/thread_context.h" #include "util/trace.h" namespace doris { -CumulativeCompaction::CumulativeCompaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker) - : Compaction(tablet, label, parent_tracker) {} +CumulativeCompaction::CumulativeCompaction(TabletSharedPtr tablet) + : Compaction(tablet, "CumulativeCompaction:" + std::to_string(tablet->tablet_id())) {} CumulativeCompaction::~CumulativeCompaction() {} OLAPStatus CumulativeCompaction::prepare_compact() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (!_tablet->init_succeeded()) { return OLAP_ERR_CUMULATIVE_INVALID_PARAMETERS; } @@ -57,6 +58,7 @@ OLAPStatus CumulativeCompaction::prepare_compact() { } OLAPStatus CumulativeCompaction::execute_compact_impl() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); MutexLock lock(_tablet->get_cumulative_lock(), TRY_LOCK); if (!lock.own_lock()) { LOG(INFO) << "The tablet is under cumulative compaction. 
tablet=" << _tablet->full_name(); @@ -94,6 +96,7 @@ OLAPStatus CumulativeCompaction::execute_compact_impl() { } OLAPStatus CumulativeCompaction::pick_rowsets_to_compact() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::vector candidate_rowsets; _tablet->pick_candidate_rowsets_to_cumulative_compaction( diff --git a/be/src/olap/cumulative_compaction.h b/be/src/olap/cumulative_compaction.h index c1d742de9f03d7..d7c26ed6699db6 100644 --- a/be/src/olap/cumulative_compaction.h +++ b/be/src/olap/cumulative_compaction.h @@ -27,8 +27,7 @@ namespace doris { class CumulativeCompaction : public Compaction { public: - CumulativeCompaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker); + CumulativeCompaction(TabletSharedPtr tablet); ~CumulativeCompaction() override; OLAPStatus prepare_compact() override; diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp index 696e2c5b5836d8..87664c01d30e20 100644 --- a/be/src/olap/delta_writer.cpp +++ b/be/src/olap/delta_writer.cpp @@ -25,18 +25,17 @@ #include "olap/schema_change.h" #include "olap/storage_engine.h" #include "runtime/row_batch.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" namespace doris { -OLAPStatus DeltaWriter::open(WriteRequest* req, const std::shared_ptr& parent, - DeltaWriter** writer) { - *writer = new DeltaWriter(req, parent, StorageEngine::instance()); +OLAPStatus DeltaWriter::open(WriteRequest* req, DeltaWriter** writer) { + *writer = new DeltaWriter(req, StorageEngine::instance()); return OLAP_SUCCESS; } -DeltaWriter::DeltaWriter(WriteRequest* req, const std::shared_ptr& parent, - StorageEngine* storage_engine) +DeltaWriter::DeltaWriter(WriteRequest* req, StorageEngine* storage_engine) : _req(*req), _tablet(nullptr), _cur_rowset(nullptr), @@ -45,8 +44,7 @@ DeltaWriter::DeltaWriter(WriteRequest* req, const std::shared_ptr& p _rowset_writer(nullptr), _tablet_schema(nullptr), _delta_written_success(false), - 
_storage_engine(storage_engine), - _parent_mem_tracker(parent) {} + _storage_engine(storage_engine) {} DeltaWriter::~DeltaWriter() { if (_is_init && !_delta_written_success) { @@ -105,8 +103,9 @@ OLAPStatus DeltaWriter::init() { return OLAP_ERR_TABLE_NOT_FOUND; } - _mem_tracker = MemTracker::create_tracker(-1, "DeltaWriter:" + std::to_string(_tablet->tablet_id()), - _parent_mem_tracker); + _mem_tracker = + MemTracker::create_tracker(-1, "DeltaWriter:" + std::to_string(_tablet->tablet_id())); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // check tablet version number if (_tablet->version_count() > config::max_tablet_version_num) { LOG(WARNING) << "failed to init delta writer. version count: " << _tablet->version_count() @@ -142,7 +141,6 @@ OLAPStatus DeltaWriter::init() { writer_context.txn_id = _req.txn_id; writer_context.load_id = _req.load_id; writer_context.segments_overlap = OVERLAPPING; - writer_context.parent_mem_tracker = _mem_tracker; RETURN_NOT_OK(RowsetFactory::create_rowset_writer(writer_context, &_rowset_writer)); _tablet_schema = &(_tablet->tablet_schema()); @@ -162,6 +160,7 @@ OLAPStatus DeltaWriter::write(Tuple* tuple) { if (!_is_init && !_is_cancelled) { RETURN_NOT_OK(init()); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_is_cancelled) { // The writer may be cancelled at any time by other thread. @@ -189,6 +188,7 @@ OLAPStatus DeltaWriter::write(const RowBatch* row_batch, const std::vector& if (!_is_init && !_is_cancelled) { RETURN_NOT_OK(init()); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_is_cancelled) { return OLAP_ERR_ALREADY_CANCELLED; @@ -214,6 +214,7 @@ OLAPStatus DeltaWriter::_flush_memtable_async() { } OLAPStatus DeltaWriter::flush_memtable_and_wait(bool need_wait) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (!_is_init) { // This writer is not initialized before flushing. 
Do nothing @@ -222,7 +223,7 @@ OLAPStatus DeltaWriter::flush_memtable_and_wait(bool need_wait) { // and at that time, the writer may not be initialized yet and that is a normal case. return OLAP_SUCCESS; } - + if (_is_cancelled) { return OLAP_ERR_ALREADY_CANCELLED; } @@ -247,6 +248,7 @@ OLAPStatus DeltaWriter::flush_memtable_and_wait(bool need_wait) { } OLAPStatus DeltaWriter::wait_flush() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (!_is_init) { // return OLAP_SUCCESS instead of OLAP_ERR_ALREADY_CANCELLED for same reason @@ -262,8 +264,7 @@ OLAPStatus DeltaWriter::wait_flush() { void DeltaWriter::_reset_mem_table() { _mem_table.reset(new MemTable(_tablet->tablet_id(), _schema.get(), _tablet_schema, _req.slots, - _req.tuple_desc, _tablet->keys_type(), _rowset_writer.get(), - _mem_tracker)); + _req.tuple_desc, _tablet->keys_type(), _rowset_writer.get())); } OLAPStatus DeltaWriter::close() { @@ -276,6 +277,7 @@ OLAPStatus DeltaWriter::close() { // for this tablet when being closed. 
RETURN_NOT_OK(init()); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_is_cancelled) { return OLAP_ERR_ALREADY_CANCELLED; @@ -287,6 +289,7 @@ OLAPStatus DeltaWriter::close() { } OLAPStatus DeltaWriter::close_wait(google::protobuf::RepeatedPtrField* tablet_vec, bool is_broken) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); DCHECK(_is_init) << "delta writer is supposed be to initialized before close_wait() being called"; @@ -297,7 +300,6 @@ OLAPStatus DeltaWriter::close_wait(google::protobuf::RepeatedPtrFieldwait()); - DCHECK_EQ(_mem_tracker->consumption(), 0); // use rowset meta manager to save meta _cur_rowset = _rowset_writer->build(); @@ -351,12 +353,12 @@ OLAPStatus DeltaWriter::close_wait(google::protobuf::RepeatedPtrFieldget_stats(); VLOG_CRITICAL << "close delta writer for tablet: " << _tablet->tablet_id() - << ", load id: " << print_id(_req.load_id) - << ", stats: " << stat; + << ", load id: " << print_id(_req.load_id) << ", stats: " << stat; return OLAP_SUCCESS; } OLAPStatus DeltaWriter::cancel() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (!_is_init || _is_cancelled) { return OLAP_SUCCESS; @@ -366,7 +368,6 @@ OLAPStatus DeltaWriter::cancel() { // cancel and wait all memtables in flush queue to be finished _flush_token->cancel(); } - DCHECK_EQ(_mem_tracker->consumption(), 0); _is_cancelled = true; return OLAP_SUCCESS; } diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h index b8db71320d6e53..3a7e5612160c46 100644 --- a/be/src/olap/delta_writer.h +++ b/be/src/olap/delta_writer.h @@ -55,8 +55,7 @@ struct WriteRequest { // This class is NOT thread-safe, external synchronization is required. 
class DeltaWriter { public: - static OLAPStatus open(WriteRequest* req, const std::shared_ptr& parent, - DeltaWriter** writer); + static OLAPStatus open(WriteRequest* req, DeltaWriter** writer); ~DeltaWriter(); @@ -91,8 +90,7 @@ class DeltaWriter { int64_t tablet_id() { return _tablet->tablet_id(); } private: - DeltaWriter(WriteRequest* req, const std::shared_ptr& parent, - StorageEngine* storage_engine); + DeltaWriter(WriteRequest* req, StorageEngine* storage_engine); // push a full memtable to flush executor OLAPStatus _flush_memtable_async(); @@ -117,7 +115,6 @@ class DeltaWriter { StorageEngine* _storage_engine; std::unique_ptr _flush_token; - std::shared_ptr _parent_mem_tracker; std::shared_ptr _mem_tracker; // The counter of number of segment flushed already. diff --git a/be/src/olap/fs/block_manager.h b/be/src/olap/fs/block_manager.h index 55be413cebe236..fd0b99ea8f8b32 100644 --- a/be/src/olap/fs/block_manager.h +++ b/be/src/olap/fs/block_manager.h @@ -30,7 +30,6 @@ namespace doris { class BlockId; class Env; -class MemTracker; class Slice; namespace fs { @@ -185,10 +184,6 @@ struct CreateBlockOptions { struct BlockManagerOptions { BlockManagerOptions() = default; - // The memory tracker under which all new memory trackers will be parented. - // If nullptr, new memory trackers will be parented to the root tracker. - std::shared_ptr parent_mem_tracker; - // If false, metrics will not be produced. 
bool enable_metric = false; diff --git a/be/src/olap/fs/file_block_manager.cpp b/be/src/olap/fs/file_block_manager.cpp index 06721b7b631498..72b0c43d374583 100644 --- a/be/src/olap/fs/file_block_manager.cpp +++ b/be/src/olap/fs/file_block_manager.cpp @@ -32,7 +32,6 @@ #include "olap/fs/block_id.h" #include "olap/fs/block_manager_metrics.h" #include "olap/storage_engine.h" -#include "runtime/mem_tracker.h" #include "util/doris_metrics.h" #include "util/file_cache.h" #include "util/metrics.h" @@ -367,9 +366,7 @@ Status FileReadableBlock::readv(uint64_t offset, const Slice* results, size_t re FileBlockManager::FileBlockManager(Env* env, BlockManagerOptions opts) : _env(DCHECK_NOTNULL(env)), - _opts(std::move(opts)), - _mem_tracker(MemTracker::create_tracker(-1, "FileBlockManager", _opts.parent_mem_tracker, - MemTrackerLevel::OVERVIEW)) { + _opts(std::move(opts)) { if (_opts.enable_metric) { _metrics.reset(new internal::BlockManagerMetrics()); } diff --git a/be/src/olap/fs/file_block_manager.h b/be/src/olap/fs/file_block_manager.h index 118d61988030eb..f8bd96743d2d97 100644 --- a/be/src/olap/fs/file_block_manager.h +++ b/be/src/olap/fs/file_block_manager.h @@ -31,7 +31,6 @@ namespace doris { class BlockId; class Env; -class MemTracker; class RandomAccessFile; namespace fs { @@ -111,10 +110,6 @@ class FileBlockManager : public BlockManager { // May be null if instantiated without metrics. std::unique_ptr _metrics; - // Tracks memory consumption of any allocations numerous enough to be - // interesting. - std::shared_ptr _mem_tracker; - // DISALLOW_COPY_AND_ASSIGN(FileBlockManager); // Underlying cache instance. Caches opened files. 
diff --git a/be/src/olap/generic_iterators.cpp b/be/src/olap/generic_iterators.cpp index 0d31955aad3844..a499f28ac6f14c 100644 --- a/be/src/olap/generic_iterators.cpp +++ b/be/src/olap/generic_iterators.cpp @@ -113,8 +113,7 @@ Status AutoIncrementIterator::next_batch(RowBlockV2* block) { // } class MergeIteratorContext { public: - MergeIteratorContext(RowwiseIterator* iter, std::shared_ptr parent) - : _iter(iter), _block(iter->schema(), 1024, std::move(parent)) {} + MergeIteratorContext(RowwiseIterator* iter) : _iter(iter), _block(iter->schema(), 1024) {} MergeIteratorContext(const MergeIteratorContext&) = delete; MergeIteratorContext(MergeIteratorContext&&) = delete; @@ -207,11 +206,10 @@ Status MergeIteratorContext::_load_next_block() { class MergeIterator : public RowwiseIterator { public: // MergeIterator takes the ownership of input iterators - MergeIterator(std::vector iters, std::shared_ptr parent, int sequence_id_idx) - : _origin_iters(std::move(iters)), _sequence_id_idx(sequence_id_idx), _merge_heap(MergeContextComparator(_sequence_id_idx)) { - // use for count the mem use of Block use in Merge - _mem_tracker = MemTracker::create_tracker(-1, "MergeIterator", std::move(parent)); - } + MergeIterator(std::vector iters, int sequence_id_idx) + : _origin_iters(std::move(iters)), + _sequence_id_idx(sequence_id_idx), + _merge_heap(MergeContextComparator(_sequence_id_idx)) {} ~MergeIterator() override { while (!_merge_heap.empty()) { @@ -245,7 +243,7 @@ class MergeIterator : public RowwiseIterator { if (cmp_res != 0) { return cmp_res > 0; } - + // Second: If sequence_id_idx != 0 means we need to compare sequence. sequence only use // in unique key. 
so keep reverse order of sequence id here if (sequence_id_idx != -1) { @@ -278,7 +276,7 @@ Status MergeIterator::init(const StorageReadOptions& opts) { _schema.reset(new Schema((*(_origin_iters.begin()))->schema())); for (auto iter : _origin_iters) { - std::unique_ptr ctx(new MergeIteratorContext(iter, _mem_tracker)); + std::unique_ptr ctx(new MergeIteratorContext(iter)); RETURN_IF_ERROR(ctx->init(opts)); if (!ctx->valid()) { continue; @@ -323,10 +321,7 @@ class UnionIterator : public RowwiseIterator { // Iterators' ownership it transfered to this class. // This class will delete all iterators when destructs // Client should not use iterators any more. - UnionIterator(std::vector &v, std::shared_ptr parent) - : _origin_iters(v.begin(), v.end()) { - _mem_tracker = MemTracker::create_tracker(-1, "UnionIterator", parent); - } + UnionIterator(std::vector& v) : _origin_iters(v.begin(), v.end()) {} ~UnionIterator() override { std::for_each(_origin_iters.begin(), _origin_iters.end(), std::default_delete()); @@ -374,18 +369,18 @@ Status UnionIterator::next_batch(RowBlockV2* block) { return Status::EndOfFile("End of UnionIterator"); } -RowwiseIterator* new_merge_iterator(std::vector inputs, std::shared_ptr parent, int sequence_id_idx) { +RowwiseIterator* new_merge_iterator(std::vector inputs, int sequence_id_idx) { if (inputs.size() == 1) { return *(inputs.begin()); } - return new MergeIterator(std::move(inputs), parent, sequence_id_idx); + return new MergeIterator(std::move(inputs), sequence_id_idx); } -RowwiseIterator* new_union_iterator(std::vector& inputs, std::shared_ptr parent) { +RowwiseIterator* new_union_iterator(std::vector& inputs) { if (inputs.size() == 1) { return *(inputs.begin()); } - return new UnionIterator(inputs, parent); + return new UnionIterator(inputs); } RowwiseIterator* new_auto_increment_iterator(const Schema& schema, size_t num_rows) { diff --git a/be/src/olap/generic_iterators.h b/be/src/olap/generic_iterators.h index 
e8f4528885ae29..5ff287b8d7cd8c 100644 --- a/be/src/olap/generic_iterators.h +++ b/be/src/olap/generic_iterators.h @@ -25,14 +25,14 @@ namespace doris { // // Inputs iterators' ownership is taken by created merge iterator. And client // should delete returned iterator after usage. -RowwiseIterator* new_merge_iterator(std::vector inputs, std::shared_ptr parent, int sequence_id_idx); +RowwiseIterator* new_merge_iterator(std::vector inputs, int sequence_id_idx); // Create a union iterator for input iterators. Union iterator will read // input iterators one by one. // // Inputs iterators' ownership is taken by created union iterator. And client // should delete returned iterator after usage. -RowwiseIterator* new_union_iterator(std::vector& inputs, std::shared_ptr parent); +RowwiseIterator* new_union_iterator(std::vector& inputs); // Create an auto increment iterator which returns num_rows data in format of schema. // This class aims to be used in unit test. diff --git a/be/src/olap/iterators.h b/be/src/olap/iterators.h index 4cdfc605788354..4609bd0a52ca8f 100644 --- a/be/src/olap/iterators.h +++ b/be/src/olap/iterators.h @@ -114,9 +114,6 @@ class RowwiseIterator { // Return the data id such as segment id, used for keep the insert order when do // merge sort in priority queue virtual uint64_t data_id() const { return 0; } - -protected: - std::shared_ptr _mem_tracker; }; } // namespace doris diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index d0ab7da32ee972..92fdf1dcd65f26 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -320,8 +320,8 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // note that the cache might get larger than its capacity if not enough // space was freed auto old = _table.insert(e); - DCHECK(thread_local_ctx.thread_mem_tracker()->get_task_mem_tracker() == nullptr); - source_mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), charge); + 
DCHECK(thread_local_ctx.thread_mem_tracker()->parent_task_mem_tracker() == nullptr); + Status st = source_mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), charge); _usage += e->total_size; if (old != nullptr) { old->in_cache = false; @@ -339,7 +339,6 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // we free the entries here outside of mutex for // performance reasons while (to_remove_head != nullptr) { - SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); LRUHandle* next = to_remove_head->next; to_remove_head->free(); to_remove_head = next; @@ -443,11 +442,11 @@ uint32_t ShardedLRUCache::_shard(uint32_t hash) { return hash >> (32 - kNumShardBits); } -ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, - std::shared_ptr parent) +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type) : _name(name), _last_id(1), - _mem_tracker(MemTracker::create_tracker(-1, name, parent, MemTrackerLevel::OVERVIEW)) { + _mem_tracker(MemTracker::create_tracker(-1, name, nullptr, MemTrackerLevel::OVERVIEW)) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const size_t per_shard = (total_capacity + (kNumShards - 1)) / kNumShards; for (int s = 0; s < kNumShards; s++) { _shards[s] = new LRUCache(type); @@ -466,6 +465,7 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, } ShardedLRUCache::~ShardedLRUCache() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); for (int s = 0; s < kNumShards; s++) { delete _shards[s]; } @@ -477,26 +477,26 @@ Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t void (*deleter)(const CacheKey& key, void* value), CachePriority priority) { std::shared_ptr source_mem_tracker = thread_local_ctx.thread_mem_tracker(); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); 
const uint32_t hash = _hash_slice(key); return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority, source_mem_tracker); } Cache::Handle* ShardedLRUCache::lookup(const CacheKey& key) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const uint32_t hash = _hash_slice(key); return _shards[_shard(hash)]->lookup(key, hash); } void ShardedLRUCache::release(Handle* handle) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); LRUHandle* h = reinterpret_cast(handle); _shards[_shard(h->hash)]->release(handle); } void ShardedLRUCache::erase(const CacheKey& key) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const uint32_t hash = _hash_slice(key); _shards[_shard(hash)]->erase(key, hash); } @@ -515,7 +515,7 @@ uint64_t ShardedLRUCache::new_id() { } int64_t ShardedLRUCache::prune() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t num_prune = 0; for (int s = 0; s < kNumShards; s++) { num_prune += _shards[s]->prune(); @@ -524,7 +524,7 @@ int64_t ShardedLRUCache::prune() { } int64_t ShardedLRUCache::prune_if(CacheValuePredicate pred) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker, "LRUCache", false); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t num_prune = 0; for (int s = 0; s < kNumShards; s++) { num_prune += _shards[s]->prune_if(pred); @@ -553,14 +553,12 @@ void ShardedLRUCache::update_cache_metrics() const { : ((double)total_hit_count / total_lookup_count)); } -Cache* new_lru_cache(const std::string& name, size_t capacity, - std::shared_ptr parent_tracker) { - return new ShardedLRUCache(name, capacity, LRUCacheType::SIZE, parent_tracker); +Cache* new_lru_cache(const 
std::string& name, size_t capacity) { + return new ShardedLRUCache(name, capacity, LRUCacheType::SIZE); } -Cache* new_typed_lru_cache(const std::string& name, size_t capacity, LRUCacheType type, - std::shared_ptr parent_tracker) { - return new ShardedLRUCache(name, capacity, type, parent_tracker); +Cache* new_typed_lru_cache(const std::string& name, size_t capacity, LRUCacheType type) { + return new ShardedLRUCache(name, capacity, type); } } // namespace doris diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index 8f35c0f4d51c63..f0453016ce96c2 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -56,11 +56,9 @@ enum LRUCacheType { // Create a new cache with a specified name and a fixed SIZE capacity. // This implementation of Cache uses a least-recently-used eviction policy. -extern Cache* new_lru_cache(const std::string& name, size_t capacity, - std::shared_ptr parent_tracekr = nullptr); +extern Cache* new_lru_cache(const std::string& name, size_t capacity); -extern Cache* new_typed_lru_cache(const std::string& name, size_t capacity, LRUCacheType type, - std::shared_ptr parent_tracekr = nullptr); +extern Cache* new_typed_lru_cache(const std::string& name, size_t capacity, LRUCacheType type); class CacheKey { public: @@ -362,8 +360,7 @@ static const int kNumShards = 1 << kNumShardBits; class ShardedLRUCache : public Cache { public: - explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, - std::shared_ptr parent); + explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type); // TODO(fdy): 析构时清除所有cache元素 virtual ~ShardedLRUCache(); virtual Handle* insert(const CacheKey& key, void* value, size_t charge, diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index c0f671166f6a29..2b347700aaaa0c 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -31,17 +31,16 @@ namespace doris { MemTable::MemTable(int64_t tablet_id, Schema* schema, const 
TabletSchema* tablet_schema, const std::vector* slot_descs, TupleDescriptor* tuple_desc, - KeysType keys_type, RowsetWriter* rowset_writer, - const std::shared_ptr& parent_tracker) + KeysType keys_type, RowsetWriter* rowset_writer) : _tablet_id(tablet_id), _schema(schema), _tablet_schema(tablet_schema), _tuple_desc(tuple_desc), _slot_descs(slot_descs), _keys_type(keys_type), - _mem_tracker(MemTracker::create_tracker(-1, "MemTable", parent_tracker)), - _buffer_mem_pool(new MemPool(_mem_tracker.get())), - _table_mem_pool(new MemPool(_mem_tracker.get())), + _mem_tracker(MemTracker::create_tracker(-1, "MemTable")), + _buffer_mem_pool(new MemPool(_mem_tracker)), + _table_mem_pool(new MemPool(_mem_tracker)), _schema_size(_schema->schema_size()), _rowset_writer(rowset_writer) { if (tablet_schema->sort_type() == SortType::ZORDER) { diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 7b0ee309d3d3a5..eced6ffd06920f 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -40,8 +40,7 @@ class MemTable { public: MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema, const std::vector* slot_descs, TupleDescriptor* tuple_desc, - KeysType keys_type, RowsetWriter* rowset_writer, - const std::shared_ptr& parent_tracker); + KeysType keys_type, RowsetWriter* rowset_writer); ~MemTable(); int64_t tablet_id() const { return _tablet_id; } diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index a2251a73afe104..dce1674052236f 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ b/be/src/olap/memtable_flush_executor.cpp @@ -57,8 +57,7 @@ OLAPStatus FlushToken::wait() { } void FlushToken::_flush_memtable(std::shared_ptr memtable, int64_t submit_task_time) { - // TODO(zxy) - // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(memtable->mem_tracker(), "FlushToken", false); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(memtable->mem_tracker()); _stats.flush_wait_time_ns += (MonotonicNanos() - 
submit_task_time); SCOPED_CLEANUP({ memtable.reset(); }); // If previous flush has failed, return directly diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index ec332f0f6caf12..66dfaaf5974a79 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -48,9 +48,7 @@ OLAPStatus Merger::merge_rowsets(TabletSharedPtr tablet, ReaderType reader_type, "failed to init row cursor when merging rowsets of tablet " + tablet->full_name()); row_cursor.allocate_memory_for_string_type(tablet->tablet_schema()); - // TODO(yingchun): monitor - std::shared_ptr tracker(new MemTracker(-1)); - std::unique_ptr mem_pool(new MemPool(tracker.get())); + std::unique_ptr mem_pool(new MemPool("Merger:merge_rowsets")); // The following procedure would last for long time, half of one day, etc. int64_t output_rows = 0; diff --git a/be/src/olap/olap_index.cpp b/be/src/olap/olap_index.cpp index 14ae7e4434cb0d..85f0625889c74c 100644 --- a/be/src/olap/olap_index.cpp +++ b/be/src/olap/olap_index.cpp @@ -40,8 +40,7 @@ MemIndex::MemIndex() _index_size(0), _data_size(0), _num_rows(0), - _tracker(new MemTracker(-1)), - _mem_pool(new MemPool(_tracker.get())) {} + _mem_pool(new MemPool("MemIndex")) {} MemIndex::~MemIndex() { _num_entries = 0; diff --git a/be/src/olap/olap_index.h b/be/src/olap/olap_index.h index 1b9c704c41007f..11e22d2b67f89c 100644 --- a/be/src/olap/olap_index.h +++ b/be/src/olap/olap_index.h @@ -291,7 +291,6 @@ class MemIndex { size_t _num_rows; std::vector* _short_key_columns; - std::shared_ptr _tracker; std::unique_ptr _mem_pool; DISALLOW_COPY_AND_ASSIGN(MemIndex); }; diff --git a/be/src/olap/page_cache.cpp b/be/src/olap/page_cache.cpp index b0555c9889a71f..65882ccbee3611 100644 --- a/be/src/olap/page_cache.cpp +++ b/be/src/olap/page_cache.cpp @@ -16,6 +16,7 @@ // under the License. 
#include "olap/page_cache.h" +#include "runtime/thread_context.h" namespace doris { @@ -31,17 +32,18 @@ StoragePageCache::StoragePageCache(size_t capacity, int32_t index_cache_percenta : _index_cache_percentage(index_cache_percentage), _mem_tracker(MemTracker::create_tracker(capacity, "StoragePageCache", nullptr, MemTrackerLevel::OVERVIEW)) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (index_cache_percentage == 0) { _data_page_cache = - std::unique_ptr(new_lru_cache("DataPageCache", capacity, _mem_tracker)); + std::unique_ptr(new_lru_cache("DataPageCache", capacity)); } else if (index_cache_percentage == 100) { _index_page_cache = - std::unique_ptr(new_lru_cache("IndexPageCache", capacity, _mem_tracker)); + std::unique_ptr(new_lru_cache("IndexPageCache", capacity)); } else if (index_cache_percentage > 0 && index_cache_percentage < 100) { _data_page_cache = std::unique_ptr(new_lru_cache( - "DataPageCache", capacity * (100 - index_cache_percentage) / 100, _mem_tracker)); + "DataPageCache", capacity * (100 - index_cache_percentage) / 100)); _index_page_cache = std::unique_ptr(new_lru_cache( - "IndexPageCache", capacity * index_cache_percentage / 100, _mem_tracker)); + "IndexPageCache", capacity * index_cache_percentage / 100)); } else { CHECK(false) << "invalid index page cache percentage"; } diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index b1a03a19ede162..833d8822594ac6 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -903,9 +903,7 @@ OLAPStatus PushBrokerReader::init(const Schema* schema, const TBrokerScanRange& } _runtime_profile = _runtime_state->runtime_profile(); _runtime_profile->set_name("PushBrokerReader"); - _mem_tracker = MemTracker::create_tracker(-1, "PushBrokerReader", - _runtime_state->instance_mem_tracker()); - _mem_pool.reset(new MemPool(_mem_tracker.get())); + _mem_pool.reset(new MemPool("PushBrokerReader")); _counter.reset(new ScannerCounter()); // init scanner diff 
--git a/be/src/olap/push_handler.h b/be/src/olap/push_handler.h index c1499b2858e36d..4c3d54a67795ea 100644 --- a/be/src/olap/push_handler.h +++ b/be/src/olap/push_handler.h @@ -211,7 +211,6 @@ class PushBrokerReader { const Schema* _schema; std::unique_ptr _runtime_state; RuntimeProfile* _runtime_profile; - std::shared_ptr _mem_tracker; std::unique_ptr _mem_pool; std::unique_ptr _counter; std::unique_ptr _scanner; diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp index 4deda90eb0cc8c..e2aa2dc5d4e877 100644 --- a/be/src/olap/reader.cpp +++ b/be/src/olap/reader.cpp @@ -37,7 +37,6 @@ #include "olap/storage_engine.h" #include "olap/tablet.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "runtime/string_value.hpp" #include "util/date_func.h" #include "util/mem_util.hpp" @@ -107,9 +106,7 @@ TabletReader::~TabletReader() { } OLAPStatus TabletReader::init(const ReaderParams& read_params) { - // TODO(yingchun): monitor - _tracker.reset(new MemTracker(-1, read_params.tablet->full_name())); - _predicate_mem_pool.reset(new MemPool(_tracker.get())); + _predicate_mem_pool.reset(new MemPool("TabletReader:" + read_params.tablet->full_name())); OLAPStatus res = _init_params(read_params); if (res != OLAP_SUCCESS) { diff --git a/be/src/olap/reader.h b/be/src/olap/reader.h index 3137e0612dd39e..de45f749c4ff13 100644 --- a/be/src/olap/reader.h +++ b/be/src/olap/reader.h @@ -185,7 +185,6 @@ class TabletReader { TabletSharedPtr tablet() { return _tablet; } - std::shared_ptr _tracker; std::unique_ptr _predicate_mem_pool; std::set _load_bf_columns; std::set _load_bf_all_columns; diff --git a/be/src/olap/row_block.cpp b/be/src/olap/row_block.cpp index 1b041c80d00c61..061972edd3d372 100644 --- a/be/src/olap/row_block.cpp +++ b/be/src/olap/row_block.cpp @@ -37,10 +37,8 @@ using std::vector; namespace doris { -RowBlock::RowBlock(const TabletSchema* schema, const std::shared_ptr& parent_tracker) - : _capacity(0), _schema(schema) { - _tracker = 
MemTracker::create_tracker(-1, "RowBlock", parent_tracker, MemTrackerLevel::VERBOSE); - _mem_pool.reset(new MemPool(_tracker.get())); +RowBlock::RowBlock(const TabletSchema* schema) : _capacity(0), _schema(schema) { + _mem_pool.reset(new MemPool("RowBlock")); } RowBlock::~RowBlock() { @@ -90,7 +88,8 @@ void RowBlock::_compute_layout() { // All field has a nullbyte in memory if (column.type() == OLAP_FIELD_TYPE_VARCHAR || column.type() == OLAP_FIELD_TYPE_HLL || - column.type() == OLAP_FIELD_TYPE_CHAR || column.type() == OLAP_FIELD_TYPE_OBJECT ||column.type() == OLAP_FIELD_TYPE_STRING) { + column.type() == OLAP_FIELD_TYPE_CHAR || column.type() == OLAP_FIELD_TYPE_OBJECT || + column.type() == OLAP_FIELD_TYPE_STRING) { // 变长部分额外计算下实际最大的字符串长度(此处length已经包括记录Length的2个字节) memory_size += sizeof(Slice) + sizeof(char); } else { diff --git a/be/src/olap/row_block.h b/be/src/olap/row_block.h index 75924fa63228de..6b1dd0255412cb 100644 --- a/be/src/olap/row_block.h +++ b/be/src/olap/row_block.h @@ -57,8 +57,7 @@ class RowBlock { friend class VectorizedRowBatch; public: - RowBlock(const TabletSchema* schema, - const std::shared_ptr& parent_tracker = nullptr); + RowBlock(const TabletSchema* schema); // 注意回收内部buffer ~RowBlock(); @@ -136,7 +135,6 @@ class RowBlock { size_t _limit = 0; uint8_t _block_status = DEL_PARTIAL_SATISFIED; - std::shared_ptr _tracker; std::unique_ptr _mem_pool; // 由于内部持有内存资源,所以这里禁止拷贝和赋值 DISALLOW_COPY_AND_ASSIGN(RowBlock); diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp index 21132d58bf2c5d..ac067c7a51db7f 100644 --- a/be/src/olap/row_block2.cpp +++ b/be/src/olap/row_block2.cpp @@ -33,14 +33,10 @@ using strings::Substitute; namespace doris { RowBlockV2::RowBlockV2(const Schema& schema, uint16_t capacity) - : RowBlockV2(schema, capacity, nullptr) {} - -RowBlockV2::RowBlockV2(const Schema& schema, uint16_t capacity, std::shared_ptr parent) : _schema(schema), _capacity(capacity), _column_vector_batches(_schema.num_columns()), - 
_tracker(MemTracker::create_tracker(-1, "RowBlockV2", std::move(parent))), - _pool(new MemPool(_tracker.get())), + _pool(new MemPool("RowBlockV2")), _selection_vector(nullptr) { for (auto cid : _schema.column_ids()) { Status status = ColumnVectorBatch::create( diff --git a/be/src/olap/row_block2.h b/be/src/olap/row_block2.h index 7f2b79d638e90c..35c4a48996addd 100644 --- a/be/src/olap/row_block2.h +++ b/be/src/olap/row_block2.h @@ -28,7 +28,6 @@ #include "olap/selection_vector.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" namespace doris { @@ -43,7 +42,6 @@ class RowBlockV2 { public: RowBlockV2(const Schema& schema, uint16_t capacity); - RowBlockV2(const Schema& schema, uint16_t capacity, std::shared_ptr parent); ~RowBlockV2(); // update number of rows contained in this block @@ -119,7 +117,6 @@ class RowBlockV2 { size_t _num_rows; // manages the memory for slice's data - std::shared_ptr _tracker; std::unique_ptr _pool; // index of selected rows for rows passed the predicate diff --git a/be/src/olap/rowset/alpha_rowset.cpp b/be/src/olap/rowset/alpha_rowset.cpp index 53c695f4d65056..9e3ff54092fab2 100644 --- a/be/src/olap/rowset/alpha_rowset.cpp +++ b/be/src/olap/rowset/alpha_rowset.cpp @@ -55,14 +55,6 @@ OLAPStatus AlphaRowset::create_reader(std::shared_ptr* result) { return OLAP_SUCCESS; } -OLAPStatus AlphaRowset::create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) { - result->reset(new AlphaRowsetReader(_schema->num_rows_per_row_block(), - std::static_pointer_cast(shared_from_this()), - parent_tracker)); - return OLAP_SUCCESS; -} - OLAPStatus AlphaRowset::remove() { VLOG_NOTICE << "begin to remove files in rowset " << unique_id() << ", version:" << start_version() << "-" << end_version() << ", tabletid:" << _rowset_meta->tablet_id(); diff --git a/be/src/olap/rowset/alpha_rowset.h b/be/src/olap/rowset/alpha_rowset.h index 364d59dc78fef6..8a5b4d9dc549f9 100644 --- 
a/be/src/olap/rowset/alpha_rowset.h +++ b/be/src/olap/rowset/alpha_rowset.h @@ -41,9 +41,6 @@ class AlphaRowset : public Rowset { OLAPStatus create_reader(std::shared_ptr* result) override; - OLAPStatus create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) override; - OLAPStatus split_range(const RowCursor& start_key, const RowCursor& end_key, uint64_t request_block_row_count, size_t key_num, std::vector* ranges) override; diff --git a/be/src/olap/rowset/alpha_rowset_reader.cpp b/be/src/olap/rowset/alpha_rowset_reader.cpp index b22bfa1fb59eb0..e883680f50bb15 100644 --- a/be/src/olap/rowset/alpha_rowset_reader.cpp +++ b/be/src/olap/rowset/alpha_rowset_reader.cpp @@ -22,11 +22,9 @@ namespace doris { -AlphaRowsetReader::AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset, - const std::shared_ptr& parent_tracker) +AlphaRowsetReader::AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset) : _num_rows_per_row_block(num_rows_per_row_block), _rowset(std::move(rowset)), - _parent_tracker(parent_tracker), _alpha_rowset_meta( std::static_pointer_cast(_rowset->rowset_meta()).get()), _segment_groups(_rowset->_segment_groups), @@ -68,8 +66,7 @@ OLAPStatus AlphaRowsetReader::init(RowsetReaderContext* read_context) { if (_current_read_context->need_ordered_result && _is_segments_overlapping && _sequential_ctxs.size() > 1) { _next_block = &AlphaRowsetReader::_merge_block; - _read_block.reset(new (std::nothrow) - RowBlock(_current_read_context->tablet_schema, _parent_tracker)); + _read_block.reset(new (std::nothrow) RowBlock(_current_read_context->tablet_schema)); if (_read_block == nullptr) { LOG(WARNING) << "new row block failed in reader"; return OLAP_ERR_MALLOC_ERROR; @@ -322,8 +319,7 @@ OLAPStatus AlphaRowsetReader::_init_merge_ctxs(RowsetReaderContext* read_context const bool use_index_stream_cache = read_context->reader_type == READER_QUERY; for (auto& segment_group : _segment_groups) { - std::unique_ptr 
new_column_data( - ColumnData::create(segment_group.get(), _parent_tracker)); + std::unique_ptr new_column_data(ColumnData::create(segment_group.get())); OLAPStatus status = new_column_data->init(); if (status != OLAP_SUCCESS) { LOG(WARNING) << "init column data failed"; diff --git a/be/src/olap/rowset/alpha_rowset_reader.h b/be/src/olap/rowset/alpha_rowset_reader.h index e76bb9465d44f5..018d78153c2f4a 100644 --- a/be/src/olap/rowset/alpha_rowset_reader.h +++ b/be/src/olap/rowset/alpha_rowset_reader.h @@ -52,8 +52,7 @@ struct AlphaMergeContextComparator { class AlphaRowsetReader : public RowsetReader { public: - AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset, - const std::shared_ptr& parent_tracker = nullptr); + AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset); ~AlphaRowsetReader() override; @@ -61,7 +60,6 @@ class AlphaRowsetReader : public RowsetReader { OLAPStatus init(RowsetReaderContext* read_context) override; // read next block data - // If parent_tracker is not null, the block we get from next_block() will have the parent_tracker. // It's ok, because we only get ref here, the block's owner is this reader. 
OLAPStatus next_block(RowBlock** block) override; @@ -104,7 +102,6 @@ class AlphaRowsetReader : public RowsetReader { private: int _num_rows_per_row_block; AlphaRowsetSharedPtr _rowset; - std::shared_ptr _parent_tracker; std::string _rowset_path; AlphaRowsetMeta* _alpha_rowset_meta; const std::vector>& _segment_groups; diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index 99ce9e5dcccfe5..fabcacfbab7716 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -74,14 +74,6 @@ OLAPStatus BetaRowset::create_reader(RowsetReaderSharedPtr* result) { return OLAP_SUCCESS; } -OLAPStatus BetaRowset::create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) { - // NOTE: We use std::static_pointer_cast for performance - result->reset(new BetaRowsetReader(std::static_pointer_cast(shared_from_this()), - parent_tracker)); - return OLAP_SUCCESS; -} - OLAPStatus BetaRowset::split_range(const RowCursor& start_key, const RowCursor& end_key, uint64_t request_block_row_count, size_t key_num, std::vector* ranges) { diff --git a/be/src/olap/rowset/beta_rowset.h b/be/src/olap/rowset/beta_rowset.h index 5030b298e1770e..81ad134904766f 100644 --- a/be/src/olap/rowset/beta_rowset.h +++ b/be/src/olap/rowset/beta_rowset.h @@ -39,9 +39,6 @@ class BetaRowset : public Rowset { OLAPStatus create_reader(RowsetReaderSharedPtr* result) override; - OLAPStatus create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) override; - static FilePathDesc segment_file_path(const FilePathDesc& segment_dir_desc, const RowsetId& rowset_id, int segment_id); diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 3aed8eb3c37124..dd7e1586a58b84 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -31,21 +31,14 @@ namespace doris { -BetaRowsetReader::BetaRowsetReader(BetaRowsetSharedPtr rowset, - 
std::shared_ptr parent_tracker) +BetaRowsetReader::BetaRowsetReader(BetaRowsetSharedPtr rowset) : _context(nullptr), _rowset(std::move(rowset)), - _stats(&_owned_stats), - _parent_tracker(std::move(parent_tracker)) { + _stats(&_owned_stats) { _rowset->acquire(); } OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { - // If do not init the RowsetReader with a parent_tracker, use the runtime_state instance_mem_tracker - if (_parent_tracker == nullptr && read_context->runtime_state != nullptr) { - _parent_tracker = read_context->runtime_state->instance_mem_tracker(); - } - RETURN_NOT_OK(_rowset->load()); _context = read_context; if (_context->stats != nullptr) { @@ -102,7 +95,7 @@ OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { std::vector> seg_iterators; for (auto& seg_ptr : _segment_cache_handle.get_segments()) { std::unique_ptr iter; - auto s = seg_ptr->new_iterator(*_schema, read_options, _parent_tracker, &iter); + auto s = seg_ptr->new_iterator(*_schema, read_options, &iter); if (!s.ok()) { LOG(WARNING) << "failed to create iterator[" << seg_ptr->id() << "]: " << s.to_string(); return OLAP_ERR_ROWSET_READER_INIT; @@ -119,9 +112,9 @@ OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { // merge or union segment iterator RowwiseIterator* final_iterator; if (read_context->need_ordered_result && _rowset->rowset_meta()->is_segments_overlapping()) { - final_iterator = new_merge_iterator(iterators, _parent_tracker, read_context->sequence_id_idx); + final_iterator = new_merge_iterator(iterators, read_context->sequence_id_idx); } else { - final_iterator = new_union_iterator(iterators, _parent_tracker); + final_iterator = new_union_iterator(iterators); } auto s = final_iterator->init(read_options); if (!s.ok()) { @@ -132,11 +125,11 @@ OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { // init input block _input_block.reset(new RowBlockV2(*_schema, - std::min(1024, read_context->batch_size), 
_parent_tracker)); + std::min(1024, read_context->batch_size))); if (!read_context->is_vec) { // init input/output block and row - _output_block.reset(new RowBlock(read_context->tablet_schema, _parent_tracker)); + _output_block.reset(new RowBlock(read_context->tablet_schema)); RowBlockInfo output_block_info; output_block_info.row_num = std::min(1024, read_context->batch_size); diff --git a/be/src/olap/rowset/beta_rowset_reader.h b/be/src/olap/rowset/beta_rowset_reader.h index 55a8938dbfb7f5..eeddef80da8653 100644 --- a/be/src/olap/rowset/beta_rowset_reader.h +++ b/be/src/olap/rowset/beta_rowset_reader.h @@ -30,14 +30,12 @@ namespace doris { class BetaRowsetReader : public RowsetReader { public: - BetaRowsetReader(BetaRowsetSharedPtr rowset, - std::shared_ptr parent_tracker = nullptr); + BetaRowsetReader(BetaRowsetSharedPtr rowset); ~BetaRowsetReader() override { _rowset->release(); } OLAPStatus init(RowsetReaderContext* read_context) override; - // If parent_tracker is not null, the block we get from next_block() will have the parent_tracker. // It's ok, because we only get ref here, the block's owner is this reader. 
OLAPStatus next_block(RowBlock** block) override; OLAPStatus next_block(vectorized::Block* block) override; @@ -63,8 +61,6 @@ class BetaRowsetReader : public RowsetReader { OlapReaderStatistics _owned_stats; OlapReaderStatistics* _stats; - std::shared_ptr _parent_tracker; - std::unique_ptr _iterator; std::unique_ptr _input_block; diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index be8713137a3dfa..8271ea232c1fef 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -223,8 +223,8 @@ OLAPStatus BetaRowsetWriter::_create_segment_writer(std::unique_ptrreset(new segment_v2::SegmentWriter(wblock.get(), _num_segment, - _context.tablet_schema, writer_options, _context.parent_mem_tracker)); + writer->reset(new segment_v2::SegmentWriter(wblock.get(), _num_segment, _context.tablet_schema, + writer_options)); { std::lock_guard l(_lock); _wblocks.push_back(std::move(wblock)); diff --git a/be/src/olap/rowset/column_data.cpp b/be/src/olap/rowset/column_data.cpp index 224367480c6f7c..178710603b726a 100644 --- a/be/src/olap/rowset/column_data.cpp +++ b/be/src/olap/rowset/column_data.cpp @@ -24,16 +24,13 @@ namespace doris { -ColumnData* ColumnData::create(SegmentGroup* segment_group, - const std::shared_ptr& parent_tracker) { - ColumnData* data = new (std::nothrow) ColumnData(segment_group, parent_tracker); +ColumnData* ColumnData::create(SegmentGroup* segment_group) { + ColumnData* data = new (std::nothrow) ColumnData(segment_group); return data; } -ColumnData::ColumnData(SegmentGroup* segment_group, - const std::shared_ptr& parent_tracker) +ColumnData::ColumnData(SegmentGroup* segment_group) : _segment_group(segment_group), - _parent_tracker(parent_tracker), _eof(false), _conditions(nullptr), _col_predicates(nullptr), @@ -138,7 +135,7 @@ OLAPStatus ColumnData::_seek_to_block(const RowBlockPosition& block_pos, bool wi _segment_reader = new (std::nothrow) SegmentReader(file_name, 
segment_group(), block_pos.segment, _seek_columns, _load_bf_columns, _conditions, _delete_handler, _delete_status, - _lru_cache, _runtime_state, _stats, _parent_tracker); + _lru_cache, _runtime_state, _stats); if (_segment_reader == nullptr) { OLAP_LOG_WARNING("fail to malloc segment reader."); return OLAP_ERR_MALLOC_ERROR; @@ -435,14 +432,12 @@ void ColumnData::set_read_params(const std::vector& return_columns, } _read_vector_batch.reset(new VectorizedRowBatch(&(_segment_group->get_tablet_schema()), - _return_columns, _num_rows_per_block, - _parent_tracker)); + _return_columns, _num_rows_per_block)); _seek_vector_batch.reset(new VectorizedRowBatch(&(_segment_group->get_tablet_schema()), - _seek_columns, _num_rows_per_block, - _parent_tracker)); + _seek_columns, _num_rows_per_block)); - _read_block.reset(new RowBlock(&(_segment_group->get_tablet_schema()), _parent_tracker)); + _read_block.reset(new RowBlock(&(_segment_group->get_tablet_schema()))); RowBlockInfo block_info; block_info.row_num = _num_rows_per_block; block_info.null_supported = true; @@ -580,7 +575,7 @@ OLAPStatus ColumnData::schema_change_init() { _read_vector_batch.reset(new VectorizedRowBatch(&(_segment_group->get_tablet_schema()), _return_columns, _num_rows_per_block)); - _read_block.reset(new RowBlock(&(_segment_group->get_tablet_schema()), _parent_tracker)); + _read_block.reset(new RowBlock(&(_segment_group->get_tablet_schema()))); RowBlockInfo block_info; block_info.row_num = _num_rows_per_block; diff --git a/be/src/olap/rowset/column_data.h b/be/src/olap/rowset/column_data.h index e2565b4c4b84a8..c5ad7410195ae7 100644 --- a/be/src/olap/rowset/column_data.h +++ b/be/src/olap/rowset/column_data.h @@ -39,10 +39,8 @@ class SegmentReader; // This class is column data reader. this class will be used in two case. 
class ColumnData { public: - static ColumnData* create(SegmentGroup* segment_group, - const std::shared_ptr& parent_tracker = nullptr); - ColumnData(SegmentGroup* segment_group, - const std::shared_ptr& parent_tracker = nullptr); + static ColumnData* create(SegmentGroup* segment_group); + ColumnData(SegmentGroup* segment_group); ~ColumnData(); // 为了与之前兼容, 暴露部分index的接口 @@ -139,7 +137,6 @@ class ColumnData { private: SegmentGroup* _segment_group; - std::shared_ptr _parent_tracker; // 当到达文件末尾或者到达end key时设置此标志 bool _eof; const Conditions* _conditions; diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index 8e952d48a6ab63..a6b533ef15588c 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -32,7 +32,6 @@ namespace doris { class DataDir; -class MemTracker; class OlapTuple; class RowCursor; class Rowset; @@ -119,10 +118,6 @@ class Rowset : public std::enable_shared_from_this { // returns OLAP_ERR_ROWSET_CREATE_READER when failed to create reader virtual OLAPStatus create_reader(std::shared_ptr* result) = 0; - // Support adding parent tracker, but should be careful about destruction sequence. - virtual OLAPStatus create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) = 0; - // Split range denoted by `start_key` and `end_key` into sub-ranges, each contains roughly // `request_block_row_count` rows. Sub-range is represented by pair of OlapTuples and added to `ranges`. // diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index 8c314f5dba3cd6..74b65327ccc157 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -62,7 +62,6 @@ struct RowsetWriterContext { // indicate whether the data among segments is overlapping. // default is OVERLAP_UNKNOWN. SegmentsOverlapPB segments_overlap; - std::shared_ptr parent_mem_tracker; // segment file use uint32 to represent row number, therefore the maximum is UINT32_MAX. 
// the default is set to INT32_MAX to avoid overflow issue when casting from uint32_t to int. // test cases can change this value to control flush timing diff --git a/be/src/olap/rowset/segment_reader.cpp b/be/src/olap/rowset/segment_reader.cpp index a1d1f9c500202e..da149b282b9309 100644 --- a/be/src/olap/rowset/segment_reader.cpp +++ b/be/src/olap/rowset/segment_reader.cpp @@ -25,6 +25,7 @@ #include "olap/in_stream.h" #include "olap/olap_cond.h" #include "olap/out_stream.h" +#include "runtime/thread_context.h" #include "olap/row_block.h" #include "olap/rowset/segment_group.h" @@ -37,8 +38,7 @@ SegmentReader::SegmentReader(const std::string file, SegmentGroup* segment_group const std::set& load_bf_columns, const Conditions* conditions, const DeleteHandler* delete_handler, const DelCondSatisfied delete_status, Cache* lru_cache, - RuntimeState* runtime_state, OlapReaderStatistics* stats, - const std::shared_ptr& parent_tracker) + RuntimeState* runtime_state, OlapReaderStatistics* stats) : _file_name(file), _segment_group(segment_group), _segment_id(segment_id), @@ -58,8 +58,7 @@ SegmentReader::SegmentReader(const std::string file, SegmentGroup* segment_group _is_using_mmap(false), _is_data_loaded(false), _buffer_size(0), - _tracker(MemTracker::create_tracker(-1, "SegmentReader:" + file, parent_tracker)), - _mem_pool(new MemPool(_tracker.get())), + _mem_pool(new MemPool("SegmentReader:" + file)), _shared_buffer(nullptr), _lru_cache(lru_cache), _runtime_state(runtime_state), @@ -86,10 +85,6 @@ SegmentReader::~SegmentReader() { _lru_cache = nullptr; _file_handler.close(); - if (_is_data_loaded && _runtime_state != nullptr) { - MemTracker::batch_consume(_buffer_size * -1, _runtime_state->mem_trackers()); - } - for (auto& it : _streams) { delete it.second; } @@ -237,6 +232,7 @@ OLAPStatus SegmentReader::seek_to_block(uint32_t first_block, uint32_t last_bloc if (!_is_data_loaded) { _reset_readers(); + if (!CHECK_MEM_LIMIT(_buffer_size)) return 
OLAP_ERR_FETCH_MEMORY_EXCEEDED; res = _read_all_data_streams(&_buffer_size); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to read data stream"); @@ -249,12 +245,6 @@ OLAPStatus SegmentReader::seek_to_block(uint32_t first_block, uint32_t last_bloc return res; } - if (_runtime_state != nullptr) { - if (!MemTracker::batch_consume(_buffer_size, _runtime_state->mem_trackers())) { - return OLAP_ERR_FETCH_MEMORY_EXCEEDED; - } - } - _is_data_loaded = true; } @@ -835,10 +825,6 @@ OLAPStatus SegmentReader::_reset_readers() { for (std::map::iterator it = _streams.begin(); it != _streams.end(); ++it) { - if (_runtime_state != nullptr) { - MemTracker::batch_consume(-1 * it->second->get_buffer_size(), - _runtime_state->mem_trackers()); - } delete it->second; } @@ -849,10 +835,6 @@ OLAPStatus SegmentReader::_reset_readers() { if ((*it) == nullptr) { continue; } - if (_runtime_state != nullptr) { - MemTracker::batch_consume(-1 * (*it)->get_buffer_size(), - _runtime_state->mem_trackers()); - } delete (*it); } diff --git a/be/src/olap/rowset/segment_reader.h b/be/src/olap/rowset/segment_reader.h index 0d3aef0b75f949..91464ac2ca5916 100644 --- a/be/src/olap/rowset/segment_reader.h +++ b/be/src/olap/rowset/segment_reader.h @@ -51,8 +51,7 @@ class SegmentReader { const std::vector& used_columns, const std::set& load_bf_columns, const Conditions* conditions, const DeleteHandler* delete_handler, const DelCondSatisfied delete_status, - Cache* lru_cache, RuntimeState* runtime_state, OlapReaderStatistics* stats, - const std::shared_ptr& parent_tracker = nullptr); + Cache* lru_cache, RuntimeState* runtime_state, OlapReaderStatistics* stats); ~SegmentReader(); @@ -317,7 +316,6 @@ class SegmentReader { std::vector _cache_handle; const FileHeader* _file_header; - std::shared_ptr _tracker; std::unique_ptr _mem_pool; StorageByteBuffer* _shared_buffer; diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 
f986fce59879a1..95ad47e4f1a9b8 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -37,8 +37,7 @@ BinaryDictPageBuilder::BinaryDictPageBuilder(const PageBuilderOptions& options) _data_page_builder(nullptr), _dict_builder(nullptr), _encoding_type(DICT_ENCODING), - _tracker(new MemTracker()), - _pool(_tracker.get()) { + _pool("BinaryDictPageBuilder") { // initially use DICT_ENCODING // TODO: the data page builder type can be created by Factory according to user config _data_page_builder.reset(new BitshufflePageBuilder(options)); diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index 6b9d23b80ad7bf..6ee2d404404f75 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -32,7 +32,6 @@ #include "olap/rowset/segment_v2/options.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "olap/rowset/segment_v2/bitshuffle_page.h" namespace doris { @@ -91,7 +90,6 @@ class BinaryDictPageBuilder : public PageBuilder { // used to remember the insertion order of dict keys std::vector _dict_items; // TODO(zc): rethink about this mem pool - std::shared_ptr _tracker; MemPool _pool; faststring _buffer; faststring _first_value; diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h index 4313169aa644ea..6007b587f10da0 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h @@ -25,7 +25,6 @@ #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/indexed_column_reader.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" namespace doris { @@ -71,8 +70,7 @@ class BitmapIndexIterator { _dict_column_iter(reader->_dict_column_reader.get()), 
_bitmap_column_iter(reader->_bitmap_column_reader.get()), _current_rowid(0), - _tracker(new MemTracker()), - _pool(new MemPool(_tracker.get())) {} + _pool(new MemPool("BitmapIndexIterator")) {} bool has_null_bitmap() const { return _reader->_has_null; } @@ -109,7 +107,6 @@ class BitmapIndexIterator { IndexedColumnIterator _dict_column_iter; IndexedColumnIterator _bitmap_column_iter; rowid_t _current_rowid; - std::shared_ptr _tracker; std::unique_ptr _pool; }; diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp index a81dc92dc56e59..73b582fa745ed7 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp @@ -26,7 +26,6 @@ #include "olap/rowset/segment_v2/indexed_column_writer.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "util/faststring.h" #include "util/slice.h" @@ -67,8 +66,7 @@ class BitmapIndexWriterImpl : public BitmapIndexWriter { explicit BitmapIndexWriterImpl(const TypeInfo* typeinfo) : _typeinfo(typeinfo), _reverted_index_size(0), - _tracker(new MemTracker()), - _pool(_tracker.get()) {} + _pool("BitmapIndexWriterImpl") {} ~BitmapIndexWriterImpl() = default; @@ -186,7 +184,6 @@ class BitmapIndexWriterImpl : public BitmapIndexWriter { roaring::Roaring _null_bitmap; // unique value to its row id list MemoryIndexType _mem_index; - std::shared_ptr _tracker; MemPool _pool; }; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h index bb9377eb922c74..28b952055ed5e9 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h @@ -27,7 +27,6 @@ #include "olap/rowset/segment_v2/indexed_column_reader.h" #include "olap/rowset/segment_v2/row_ranges.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" namespace doris 
{ @@ -69,8 +68,7 @@ class BloomFilterIndexIterator { explicit BloomFilterIndexIterator(BloomFilterIndexReader* reader) : _reader(reader), _bloom_filter_iter(reader->_bloom_filter_reader.get()), - _tracker(new MemTracker()), - _pool(new MemPool(_tracker.get())) {} + _pool(new MemPool("BloomFilterIndexIterator")) {} // Read bloom filter at the given ordinal into `bf`. Status read_bloom_filter(rowid_t ordinal, std::unique_ptr* bf); @@ -80,7 +78,6 @@ class BloomFilterIndexIterator { private: BloomFilterIndexReader* _reader; IndexedColumnIterator _bloom_filter_iter; - std::shared_ptr _tracker; std::unique_ptr _pool; }; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index d45b2deb272754..3e1a204725ceae 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -28,7 +28,6 @@ #include "olap/rowset/segment_v2/indexed_column_writer.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "util/faststring.h" #include "util/slice.h" @@ -72,8 +71,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { const TypeInfo* typeinfo) : _bf_options(bf_options), _typeinfo(typeinfo), - _tracker(new MemTracker(-1, "BloomFilterIndexWriterImpl")), - _pool(_tracker.get()), + _pool("BloomFilterIndexWriterImpl"), _has_null(false), _bf_buffer_size(0) {} @@ -163,7 +161,6 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { private: BloomFilterOptions _bf_options; const TypeInfo* _typeinfo; - std::shared_ptr _tracker; MemPool _pool; bool _has_null; uint64_t _bf_buffer_size; diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index db77577788d004..dc4fa49ec014c4 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -72,8 +72,6 @@ struct 
ColumnIteratorOptions { // INDEX_PAGE including index_page, dict_page and short_key_page PageTypePB type; - std::shared_ptr mem_tracker; - void sanity_check() const { CHECK_NOTNULL(rblock); CHECK_NOTNULL(stats); @@ -198,7 +196,6 @@ class ColumnIterator { virtual ~ColumnIterator() = default; virtual Status init(const ColumnIteratorOptions& opts) { - DCHECK(opts.mem_tracker.get() != nullptr); _opts = opts; return Status::OK(); } @@ -386,8 +383,7 @@ class DefaultValueColumnIterator : public ColumnIterator { _schema_length(schema_length), _is_default_value_null(false), _type_size(0), - _tracker(new MemTracker()), - _pool(new MemPool(_tracker.get())) {} + _pool(new MemPool("DefaultValueColumnIterator")) {} Status init(const ColumnIteratorOptions& opts) override; @@ -423,7 +419,6 @@ class DefaultValueColumnIterator : public ColumnIterator { bool _is_default_value_null; size_t _type_size; void* _mem_value = nullptr; - std::shared_ptr _tracker; std::unique_ptr _pool; // current rowid diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index b98f4883ca4e09..ebd5430ad81b33 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -59,7 +59,6 @@ struct ColumnWriterOptions { << ", need_bloom_filter" << need_bloom_filter; return ss.str(); } - std::shared_ptr parent = nullptr; }; class BitmapIndexWriter; @@ -142,9 +141,6 @@ class ColumnWriter { private: std::unique_ptr _field; bool _is_nullable; - -protected: - std::shared_ptr _mem_tracker; }; class FlushPageCallback { diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp index 088de6940ed6de..16586a24c576ed 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp @@ -41,8 +41,7 @@ IndexedColumnWriter::IndexedColumnWriter(const IndexedColumnWriterOptions& optio : 
_options(options), _typeinfo(typeinfo), _wblock(wblock), - _mem_tracker(new MemTracker()), - _mem_pool(_mem_tracker.get()), + _mem_pool("IndexedColumnWriter"), _num_values(0), _num_data_pages(0), _value_key_coder(nullptr), diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.h b/be/src/olap/rowset/segment_v2/indexed_column_writer.h index bcb27f434351d7..691440afdcf57d 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_writer.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.h @@ -27,7 +27,6 @@ #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/page_pointer.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "util/slice.h" namespace doris { @@ -91,7 +90,6 @@ class IndexedColumnWriter { const TypeInfo* _typeinfo; fs::WritableBlock* _wblock; // only used for `_first_value` - std::shared_ptr _mem_tracker; MemPool _mem_pool; ordinal_t _num_values; diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 48f94b0322cfed..d0d6836c02a8e7 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -47,14 +47,11 @@ Status Segment::open(const FilePathDesc& path_desc, uint32_t segment_id, const T return Status::OK(); } -Segment::Segment(const FilePathDesc& path_desc, uint32_t segment_id, const TabletSchema* tablet_schema) - : _path_desc(path_desc), _segment_id(segment_id), - _tablet_schema(tablet_schema) { -#ifndef BE_TEST - _mem_tracker = MemTracker::create_tracker(-1, "Segment", StorageEngine::instance()->tablet_mem_tracker()); -#else - _mem_tracker = MemTracker::create_tracker(-1, "Segment", nullptr); -#endif +Segment::Segment(const FilePathDesc& path_desc, uint32_t segment_id, + const TabletSchema* tablet_schema) + : _path_desc(path_desc), _segment_id(segment_id), _tablet_schema(tablet_schema) { + _mem_tracker = MemTracker::create_tracker(-1, "Segment", + StorageEngine::instance()->tablet_mem_tracker()); 
} Segment::~Segment() { @@ -69,7 +66,6 @@ Status Segment::_open() { } Status Segment::new_iterator(const Schema& schema, const StorageReadOptions& read_options, - std::shared_ptr parent, std::unique_ptr* iter) { if (!_is_open) { RETURN_IF_ERROR(_open()); @@ -94,7 +90,7 @@ Status Segment::new_iterator(const Schema& schema, const StorageReadOptions& rea } RETURN_IF_ERROR(_load_index()); - iter->reset(new SegmentIterator(this->shared_from_this(), schema, parent)); + iter->reset(new SegmentIterator(this->shared_from_this(), schema)); iter->get()->init(read_options); return Status::OK(); } @@ -202,7 +198,7 @@ Status Segment::_create_column_readers() { return Status::OK(); } -Status Segment::new_column_iterator(uint32_t cid, std::shared_ptr parent, ColumnIterator** iter) { +Status Segment::new_column_iterator(uint32_t cid, ColumnIterator** iter) { if (_column_readers[cid] == nullptr) { const TabletColumn& tablet_column = _tablet_schema->column(cid); if (!tablet_column.has_default_value() && !tablet_column.is_nullable()) { @@ -214,7 +210,6 @@ Status Segment::new_column_iterator(uint32_t cid, std::shared_ptr pa tablet_column.has_default_value(), tablet_column.default_value(), tablet_column.is_nullable(), type_info, tablet_column.length())); ColumnIteratorOptions iter_opts; - iter_opts.mem_tracker = MemTracker::create_tracker(-1, "DefaultColumnIterator", parent); RETURN_IF_ERROR(default_value_iter->init(iter_opts)); *iter = default_value_iter.release(); diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 56fc852d9864f6..857d6744705b96 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -64,15 +64,13 @@ class Segment : public std::enable_shared_from_this { ~Segment(); - Status new_iterator(const Schema& schema, const StorageReadOptions& read_options, - std::shared_ptr parent, - std::unique_ptr* iter); + Status new_iterator(const Schema& schema, const StorageReadOptions& 
read_options, std::unique_ptr* iter); uint64_t id() const { return _segment_id; } uint32_t num_rows() const { return _footer.num_rows(); } - Status new_column_iterator(uint32_t cid, std::shared_ptr parent, ColumnIterator** iter); + Status new_column_iterator(uint32_t cid, ColumnIterator** iter); Status new_bitmap_index_iterator(uint32_t cid, BitmapIndexIterator** iter); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 9a72ef26a59330..891913e0725b3d 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -91,18 +91,14 @@ class SegmentIterator::BitmapRangeIterator { bool _eof = false; }; -SegmentIterator::SegmentIterator(std::shared_ptr segment, const Schema& schema, - std::shared_ptr parent) +SegmentIterator::SegmentIterator(std::shared_ptr segment, const Schema& schema) : _segment(std::move(segment)), _schema(schema), _column_iterators(_schema.num_columns(), nullptr), _bitmap_index_iterators(_schema.num_columns(), nullptr), _cur_rowid(0), _lazy_materialization_read(false), - _inited(false) { - // use for count the mem use of ColumnIterator - _mem_tracker = MemTracker::create_tracker(-1, "SegmentIterator", std::move(parent)); -} + _inited(false) {} SegmentIterator::~SegmentIterator() { for (auto iter : _column_iterators) { @@ -198,18 +194,16 @@ Status SegmentIterator::_prepare_seek(const StorageReadOptions::KeyRange& key_ra } } _seek_schema = std::make_unique(key_fields, key_fields.size()); - _seek_block = std::make_unique(*_seek_schema, 1, _mem_tracker); + _seek_block = std::make_unique(*_seek_schema, 1); // create used column iterator for (auto cid : _seek_schema->column_ids()) { if (_column_iterators[cid] == nullptr) { RETURN_IF_ERROR( - _segment->new_column_iterator(cid, _mem_tracker, &_column_iterators[cid])); + _segment->new_column_iterator(cid, &_column_iterators[cid])); ColumnIteratorOptions iter_opts; iter_opts.stats 
= _opts.stats; iter_opts.rblock = _rblock.get(); - iter_opts.mem_tracker = - MemTracker::create_tracker(-1, "ColumnIterator", _mem_tracker); RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts)); } } @@ -335,13 +329,11 @@ Status SegmentIterator::_init_return_column_iterators() { for (auto cid : _schema.column_ids()) { if (_column_iterators[cid] == nullptr) { RETURN_IF_ERROR( - _segment->new_column_iterator(cid, _mem_tracker, &_column_iterators[cid])); + _segment->new_column_iterator(cid, &_column_iterators[cid])); ColumnIteratorOptions iter_opts; iter_opts.stats = _opts.stats; iter_opts.use_page_cache = _opts.use_page_cache; iter_opts.rblock = _rblock.get(); - iter_opts.mem_tracker = - MemTracker::create_tracker(-1, "ColumnIterator", _mem_tracker); RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts)); } } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 2eae13eb3422d0..a93ef74bc5f1fd 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -47,8 +47,7 @@ class ColumnIterator; class SegmentIterator : public RowwiseIterator { public: - SegmentIterator(std::shared_ptr segment, const Schema& _schema, - std::shared_ptr parent); + SegmentIterator(std::shared_ptr segment, const Schema& _schema); ~SegmentIterator() override; Status init(const StorageReadOptions& opts) override; diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index e7d4db99c514cd..dd8b21985a103b 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -28,6 +28,7 @@ #include "olap/short_key_index.h" #include "runtime/mem_tracker.h" #include "util/crc32c.h" +#include "runtime/thread_context.h" #include "util/faststring.h" namespace doris { @@ -37,14 +38,13 @@ const char* k_segment_magic = "D0R1"; const uint32_t k_segment_magic_length = 4; 
SegmentWriter::SegmentWriter(fs::WritableBlock* wblock, uint32_t segment_id, - const TabletSchema* tablet_schema, const SegmentWriterOptions& opts, - std::shared_ptr parent) + const TabletSchema* tablet_schema, const SegmentWriterOptions& opts) : _segment_id(segment_id), _tablet_schema(tablet_schema), _opts(opts), _wblock(wblock), _mem_tracker( - MemTracker::create_tracker(-1, "Segment-" + std::to_string(segment_id), parent)) { + MemTracker::create_virtual_tracker(-1, "SegmentWriter:Segment-" + std::to_string(segment_id))) { CHECK_NOTNULL(_wblock); } @@ -90,7 +90,6 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec __attribute__((unused)) return Status::NotSupported("Do not support bitmap index for array type"); } } - opts.parent = _mem_tracker; std::unique_ptr writer; RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _wblock, &writer)); diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index d0600996ad9292..77a66c85db3640 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -53,7 +53,7 @@ struct SegmentWriterOptions { class SegmentWriter { public: explicit SegmentWriter(fs::WritableBlock* block, uint32_t segment_id, - const TabletSchema* tablet_schema, const SegmentWriterOptions& opts, std::shared_ptr parent = nullptr); + const TabletSchema* tablet_schema, const SegmentWriterOptions& opts); ~SegmentWriter(); Status init(uint32_t write_mbytes_per_sec); diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.cpp b/be/src/olap/rowset/segment_v2/zone_map_index.cpp index ce29cfdb8130aa..e63df2bcc76c0e 100644 --- a/be/src/olap/rowset/segment_v2/zone_map_index.cpp +++ b/be/src/olap/rowset/segment_v2/zone_map_index.cpp @@ -25,14 +25,13 @@ #include "olap/rowset/segment_v2/indexed_column_writer.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" namespace doris { namespace segment_v2 { 
ZoneMapIndexWriter::ZoneMapIndexWriter(Field* field) - : _field(field), _tracker(new MemTracker(-1, "ZoneMapIndexWriter")), _pool(_tracker.get()) { + : _field(field), _pool("ZoneMapIndexWriter") { _page_zone_map.min_value = _field->allocate_zone_map_value(&_pool); _page_zone_map.max_value = _field->allocate_zone_map_value(&_pool); _reset_zone_map(&_page_zone_map); @@ -129,8 +128,7 @@ Status ZoneMapIndexReader::load(bool use_page_cache, bool kept_in_memory) { RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory)); IndexedColumnIterator iter(&reader); - auto tracker = std::make_shared(-1, "temp in ZoneMapIndexReader"); - MemPool pool(tracker.get()); + MemPool pool("ZoneMapIndexReader ColumnBlock"); _page_zone_maps.resize(reader.num_values()); // read and cache all page zone maps diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.h b/be/src/olap/rowset/segment_v2/zone_map_index.h index 0c129c5bd94ecd..f8ddfbb3525b03 100644 --- a/be/src/olap/rowset/segment_v2/zone_map_index.h +++ b/be/src/olap/rowset/segment_v2/zone_map_index.h @@ -27,7 +27,6 @@ #include "olap/field.h" #include "olap/rowset/segment_v2/binary_plain_page.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "util/slice.h" namespace doris { @@ -109,7 +108,6 @@ class ZoneMapIndexWriter { ZoneMap _segment_zone_map; // TODO(zc): we should replace this memory pool later, we only allocate min/max // for field. But MemPool allocate 4KB least, it will a waste for most cases. 
- std::shared_ptr _tracker; MemPool _pool; // serialized ZoneMapPB for each data page diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index b73d6ef30a1bba..d261faff0fc650 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -37,6 +37,7 @@ #include "runtime/exec_env.h" #include "runtime/mem_pool.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "util/defer_op.h" using std::deque; @@ -56,9 +57,7 @@ class RowBlockSorter { public: explicit RowBlockSorter(RowBlockAllocator* allocator); virtual ~RowBlockSorter(); - size_t num_rows() { - return _swap_row_block != nullptr ? _swap_row_block->capacity() : 0; - } + size_t num_rows() { return _swap_row_block != nullptr ? _swap_row_block->capacity() : 0; } bool sort(RowBlock** row_block); @@ -78,7 +77,7 @@ class RowBlockMerger { virtual ~RowBlockMerger(); bool merge(const std::vector& row_block_arr, RowsetWriter* rowset_writer, - std::shared_ptr parent, uint64_t* merged_rows); + uint64_t* merged_rows); private: struct MergeElement { @@ -742,10 +741,9 @@ bool RowBlockSorter::sort(RowBlock** row_block) { return true; } -RowBlockAllocator::RowBlockAllocator(const TabletSchema& tablet_schema, - std::shared_ptr parent, size_t memory_limitation) +RowBlockAllocator::RowBlockAllocator(const TabletSchema& tablet_schema, size_t memory_limitation) : _tablet_schema(tablet_schema), - _mem_tracker(MemTracker::create_tracker(-1, "RowBlockAllocator", parent)), + _mem_tracker(MemTracker::create_virtual_tracker(-1, "RowBlockAllocator")), _row_len(tablet_schema.row_size()), _memory_limitation(memory_limitation) { VLOG_NOTICE << "RowBlockAllocator(). 
row_len=" << _row_len; @@ -806,7 +804,7 @@ void RowBlockAllocator::release(RowBlock* row_block) { delete row_block; } -bool RowBlockAllocator::is_memory_enough_for_sorting(size_t num_rows, size_t allocated_rows){ +bool RowBlockAllocator::is_memory_enough_for_sorting(size_t num_rows, size_t allocated_rows) { if (num_rows <= allocated_rows) { return true; } @@ -814,17 +812,15 @@ bool RowBlockAllocator::is_memory_enough_for_sorting(size_t num_rows, size_t all return _mem_tracker->consumption() + row_block_size < _memory_limitation; } - RowBlockMerger::RowBlockMerger(TabletSharedPtr tablet) : _tablet(tablet) {} RowBlockMerger::~RowBlockMerger() {} bool RowBlockMerger::merge(const std::vector& row_block_arr, RowsetWriter* rowset_writer, - std::shared_ptr parent, uint64_t* merged_rows) { + uint64_t* merged_rows) { uint64_t tmp_merged_rows = 0; RowCursor row_cursor; - std::shared_ptr tracker(MemTracker::create_tracker(-1, "RowBlockMerger", parent)); - std::unique_ptr mem_pool(new MemPool(tracker.get())); + std::unique_ptr mem_pool(new MemPool("RowBlockMerger")); std::unique_ptr agg_object_pool(new ObjectPool()); if (row_cursor.init(_tablet->tablet_schema()) != OLAP_SUCCESS) { LOG(WARNING) << "fail to init row cursor."; @@ -932,32 +928,31 @@ bool RowBlockMerger::_pop_heap() { OLAPStatus LinkedSchemaChange::process(RowsetReaderSharedPtr rowset_reader, RowsetWriter* new_rowset_writer, TabletSharedPtr new_tablet, TabletSharedPtr base_tablet) { - // In some cases, there may be more than one type of rowset in a tablet, // in which case the conversion cannot be done directly by linked schema change, // but requires direct schema change to rewrite the data. 
if (rowset_reader->type() != new_rowset_writer->type()) { - LOG(INFO) << "the type of rowset " << rowset_reader->rowset()->rowset_id() << " in base tablet " << base_tablet->tablet_id() - << " is not same as type " << new_rowset_writer->type() << ", use direct schema change."; - SchemaChangeDirectly scd(_row_block_changer, _mem_tracker); + LOG(INFO) << "the type of rowset " << rowset_reader->rowset()->rowset_id() + << " in base tablet " << base_tablet->tablet_id() << " is not same as type " + << new_rowset_writer->type() << ", use direct schema change."; + SchemaChangeDirectly scd(_row_block_changer); return scd.process(rowset_reader, new_rowset_writer, new_tablet, base_tablet); } else { OLAPStatus status = new_rowset_writer->add_rowset_for_linked_schema_change( rowset_reader->rowset(), _row_block_changer.get_schema_mapping()); if (status != OLAP_SUCCESS) { LOG(WARNING) << "fail to convert rowset." - << ", new_tablet=" << new_tablet->full_name() - << ", base_tablet=" << base_tablet->full_name() - << ", version=" << new_rowset_writer->version().first << "-" - << new_rowset_writer->version().second; + << ", new_tablet=" << new_tablet->full_name() + << ", base_tablet=" << base_tablet->full_name() + << ", version=" << new_rowset_writer->version().first << "-" + << new_rowset_writer->version().second; } return status; } } -SchemaChangeDirectly::SchemaChangeDirectly(const RowBlockChanger& row_block_changer, - std::shared_ptr mem_tracker) - : SchemaChange(mem_tracker), +SchemaChangeDirectly::SchemaChangeDirectly(const RowBlockChanger& row_block_changer) + : SchemaChange(), _row_block_changer(row_block_changer), _row_block_allocator(nullptr), _cursor(nullptr) {} @@ -1002,7 +997,7 @@ OLAPStatus SchemaChangeDirectly::process(RowsetReaderSharedPtr rowset_reader, RowsetWriter* rowset_writer, TabletSharedPtr new_tablet, TabletSharedPtr base_tablet) { if (_row_block_allocator == nullptr) { - _row_block_allocator = new RowBlockAllocator(new_tablet->tablet_schema(), _mem_tracker, 
0); + _row_block_allocator = new RowBlockAllocator(new_tablet->tablet_schema(), 0); if (_row_block_allocator == nullptr) { LOG(FATAL) << "failed to malloc RowBlockAllocator. size=" << sizeof(RowBlockAllocator); return OLAP_ERR_INPUT_PARAMETER_ERROR; @@ -1115,9 +1110,8 @@ OLAPStatus SchemaChangeDirectly::process(RowsetReaderSharedPtr rowset_reader, } SchemaChangeWithSorting::SchemaChangeWithSorting(const RowBlockChanger& row_block_changer, - std::shared_ptr mem_tracker, size_t memory_limitation) - : SchemaChange(mem_tracker), + : SchemaChange(), _row_block_changer(row_block_changer), _memory_limitation(memory_limitation), _row_block_allocator(nullptr) { @@ -1138,8 +1132,8 @@ OLAPStatus SchemaChangeWithSorting::process(RowsetReaderSharedPtr rowset_reader, TabletSharedPtr new_tablet, TabletSharedPtr base_tablet) { if (_row_block_allocator == nullptr) { - _row_block_allocator = new (nothrow) - RowBlockAllocator(new_tablet->tablet_schema(), _mem_tracker, _memory_limitation); + _row_block_allocator = + new (nothrow) RowBlockAllocator(new_tablet->tablet_schema(), _memory_limitation); if (_row_block_allocator == nullptr) { LOG(FATAL) << "failed to malloc RowBlockAllocator. 
size=" << sizeof(RowBlockAllocator); return OLAP_ERR_INPUT_PARAMETER_ERROR; @@ -1174,7 +1168,7 @@ OLAPStatus SchemaChangeWithSorting::process(RowsetReaderSharedPtr rowset_reader, // src_rowsets to store the rowset generated by internal sorting std::vector src_rowsets; - Defer defer{[&]() { + Defer defer {[&]() { // remove the intermediate rowsets generated by internal sorting for (auto& row_set : src_rowsets) { StorageEngine::instance()->add_unused_rowset(row_set); @@ -1208,10 +1202,10 @@ OLAPStatus SchemaChangeWithSorting::process(RowsetReaderSharedPtr rowset_reader, LOG(WARNING) << "failed to allocate RowBlock."; return OLAP_ERR_INPUT_PARAMETER_ERROR; } else { - // do memory check for sorting, in case schema change task fail at row block sorting because of + // do memory check for sorting, in case schema change task fail at row block sorting because of // not doing internal sorting first - if (!_row_block_allocator->is_memory_enough_for_sorting(ref_row_block->row_block_info().row_num, - row_block_sorter.num_rows())) { + if (!_row_block_allocator->is_memory_enough_for_sorting( + ref_row_block->row_block_info().row_num, row_block_sorter.num_rows())) { if (new_row_block != nullptr) { _row_block_allocator->release(new_row_block); new_row_block = nullptr; @@ -1367,7 +1361,6 @@ bool SchemaChangeWithSorting::_internal_sorting(const std::vector& ro context.rowset_state = VISIBLE; context.version = version; context.segments_overlap = segments_overlap; - context.parent_mem_tracker = _mem_tracker; VLOG_NOTICE << "init rowset builder. 
tablet=" << new_tablet->full_name() << ", block_row_size=" << new_tablet->num_rows_per_row_block(); @@ -1377,7 +1370,7 @@ bool SchemaChangeWithSorting::_internal_sorting(const std::vector& ro return false; } - if (!merger.merge(row_block_arr, rowset_writer.get(), _mem_tracker, &merged_rows)) { + if (!merger.merge(row_block_arr, rowset_writer.get(), &merged_rows)) { LOG(WARNING) << "failed to merge row blocks."; new_tablet->data_dir()->remove_pending_ids(ROWSET_ID_PREFIX + rowset_writer->rowset_id().to_string()); @@ -1396,7 +1389,7 @@ bool SchemaChangeWithSorting::_external_sorting(vector& src_row std::vector rs_readers; for (auto& rowset : src_rowsets) { RowsetReaderSharedPtr rs_reader; - auto res = rowset->create_reader(_mem_tracker, &rs_reader); + auto res = rowset->create_reader(&rs_reader); if (res != OLAP_SUCCESS) { LOG(WARNING) << "failed to create rowset reader."; return false; @@ -1419,7 +1412,7 @@ bool SchemaChangeWithSorting::_external_sorting(vector& src_row } SchemaChangeHandler::SchemaChangeHandler() - : _mem_tracker(MemTracker::create_tracker(-1, "SchemaChange", StorageEngine::instance()->schema_change_mem_tracker())) { + : _mem_tracker(MemTracker::create_tracker(-1, "SchemaChangeHandler")) { REGISTER_HOOK_METRIC(schema_change_mem_consumption, [this]() { return _mem_tracker->consumption(); }); } @@ -1429,6 +1422,7 @@ SchemaChangeHandler::~SchemaChangeHandler() { } OLAPStatus SchemaChangeHandler::process_alter_tablet_v2(const TAlterTabletReqV2& request) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); LOG(INFO) << "begin to do request alter tablet: base_tablet_id=" << request.base_tablet_id << ", base_schema_hash=" << request.base_schema_hash << ", new_tablet_id=" << request.new_tablet_id @@ -1531,9 +1525,6 @@ OLAPStatus SchemaChangeHandler::_do_process_alter_tablet_v2(const TAlterTabletRe reader_context.seek_columns = &return_columns; reader_context.sequence_id_idx = reader_context.tablet_schema->sequence_col_idx(); - auto mem_tracker = 
MemTracker::create_tracker(-1, "AlterTablet:" + std::to_string(base_tablet->tablet_id()) + "-" - + std::to_string(new_tablet->tablet_id()), _mem_tracker, MemTrackerLevel::TASK); - do { // get history data to be converted and it will check if there is hold in base tablet res = _get_versions_to_be_changed(base_tablet, &versions_to_be_changed); @@ -1595,7 +1586,7 @@ OLAPStatus SchemaChangeHandler::_do_process_alter_tablet_v2(const TAlterTabletRe } // acquire data sources correspond to history versions - base_tablet->capture_rs_readers(versions_to_be_changed, &rs_readers, mem_tracker); + base_tablet->capture_rs_readers(versions_to_be_changed, &rs_readers); if (rs_readers.size() < 1) { LOG(WARNING) << "fail to acquire all data sources. " << "version_num=" << versions_to_be_changed.size() @@ -1693,6 +1684,7 @@ OLAPStatus SchemaChangeHandler::schema_version_convert(TabletSharedPtr base_tabl TabletSharedPtr new_tablet, RowsetSharedPtr* base_rowset, RowsetSharedPtr* new_rowset) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); OLAPStatus res = OLAP_SUCCESS; LOG(INFO) << "begin to convert delta version for schema changing. 
" << "base_tablet=" << base_tablet->full_name() @@ -1719,14 +1711,14 @@ OLAPStatus SchemaChangeHandler::schema_version_convert(TabletSharedPtr base_tabl size_t memory_limitation = config::memory_limitation_per_thread_for_schema_change; LOG(INFO) << "doing schema change with sorting for base_tablet " << base_tablet->full_name(); - sc_procedure = new (nothrow) SchemaChangeWithSorting( - rb_changer, _mem_tracker, memory_limitation * 1024 * 1024 * 1024); + sc_procedure = new (nothrow) + SchemaChangeWithSorting(rb_changer, memory_limitation * 1024 * 1024 * 1024); } else if (sc_directly) { LOG(INFO) << "doing schema change directly for base_tablet " << base_tablet->full_name(); - sc_procedure = new (nothrow) SchemaChangeDirectly(rb_changer, _mem_tracker); + sc_procedure = new (nothrow) SchemaChangeDirectly(rb_changer); } else { LOG(INFO) << "doing linked schema change for base_tablet " << base_tablet->full_name(); - sc_procedure = new (nothrow) LinkedSchemaChange(rb_changer, _mem_tracker); + sc_procedure = new (nothrow) LinkedSchemaChange(rb_changer); } if (sc_procedure == nullptr) { @@ -1753,7 +1745,7 @@ OLAPStatus SchemaChangeHandler::schema_version_convert(TabletSharedPtr base_tabl reader_context.sequence_id_idx = reader_context.tablet_schema->sequence_col_idx(); RowsetReaderSharedPtr rowset_reader; - RETURN_NOT_OK((*base_rowset)->create_reader(_mem_tracker, &rowset_reader)); + RETURN_NOT_OK((*base_rowset)->create_reader(&rowset_reader)); RETURN_NOT_OK(rowset_reader->init(&reader_context)); RowsetWriterContext writer_context; @@ -1773,7 +1765,6 @@ OLAPStatus SchemaChangeHandler::schema_version_convert(TabletSharedPtr base_tabl writer_context.load_id.set_hi((*base_rowset)->load_id().hi()); writer_context.load_id.set_lo((*base_rowset)->load_id().lo()); writer_context.segments_overlap = (*base_rowset)->rowset_meta()->segments_overlap(); - writer_context.parent_mem_tracker = _mem_tracker; std::unique_ptr rowset_writer; RowsetFactory::create_rowset_writer(writer_context, 
&rowset_writer); @@ -1871,16 +1862,16 @@ OLAPStatus SchemaChangeHandler::_convert_historical_rowsets(const SchemaChangePa size_t memory_limitation = config::memory_limitation_per_thread_for_schema_change; LOG(INFO) << "doing schema change with sorting for base_tablet " << sc_params.base_tablet->full_name(); - sc_procedure = new (nothrow) SchemaChangeWithSorting( - rb_changer, _mem_tracker, memory_limitation * 1024 * 1024 * 1024); + sc_procedure = new (nothrow) + SchemaChangeWithSorting(rb_changer, memory_limitation * 1024 * 1024 * 1024); } else if (sc_directly) { LOG(INFO) << "doing schema change directly for base_tablet " << sc_params.base_tablet->full_name(); - sc_procedure = new (nothrow) SchemaChangeDirectly(rb_changer, _mem_tracker); + sc_procedure = new (nothrow) SchemaChangeDirectly(rb_changer); } else { LOG(INFO) << "doing linked schema change for base_tablet " << sc_params.base_tablet->full_name(); - sc_procedure = new (nothrow) LinkedSchemaChange(rb_changer, _mem_tracker); + sc_procedure = new (nothrow) LinkedSchemaChange(rb_changer); } if (sc_procedure == nullptr) { @@ -1918,7 +1909,6 @@ OLAPStatus SchemaChangeHandler::_convert_historical_rowsets(const SchemaChangePa writer_context.rowset_state = VISIBLE; writer_context.version = rs_reader->version(); writer_context.segments_overlap = rs_reader->rowset()->rowset_meta()->segments_overlap(); - writer_context.parent_mem_tracker = _mem_tracker; std::unique_ptr rowset_writer; OLAPStatus status = RowsetFactory::create_rowset_writer(writer_context, &rowset_writer); diff --git a/be/src/olap/schema_change.h b/be/src/olap/schema_change.h index 53aa34934a5aec..4312e2cc91b4c5 100644 --- a/be/src/olap/schema_change.h +++ b/be/src/olap/schema_change.h @@ -76,7 +76,7 @@ class RowBlockChanger { class RowBlockAllocator { public: - RowBlockAllocator(const TabletSchema& tablet_schema, std::shared_ptr parent, size_t memory_limitation); + RowBlockAllocator(const TabletSchema& tablet_schema, size_t memory_limitation); virtual 
~RowBlockAllocator(); OLAPStatus allocate(RowBlock** row_block, size_t num_rows, bool null_supported); @@ -93,7 +93,7 @@ class RowBlockAllocator { class SchemaChange { public: - SchemaChange(std::shared_ptr tracker) : _mem_tracker(std::move(tracker)), _filtered_rows(0), _merged_rows(0) {} + SchemaChange() : _filtered_rows(0), _merged_rows(0) {} virtual ~SchemaChange() = default; virtual OLAPStatus process(RowsetReaderSharedPtr rowset_reader, @@ -111,8 +111,7 @@ class SchemaChange { void reset_filtered_rows() { _filtered_rows = 0; } void reset_merged_rows() { _merged_rows = 0; } -protected: - std::shared_ptr _mem_tracker; + private: uint64_t _filtered_rows; uint64_t _merged_rows; @@ -120,8 +119,8 @@ class SchemaChange { class LinkedSchemaChange : public SchemaChange { public: - explicit LinkedSchemaChange(const RowBlockChanger& row_block_changer, std::shared_ptr mem_tracker) - : SchemaChange(mem_tracker), _row_block_changer(row_block_changer) {} + explicit LinkedSchemaChange(const RowBlockChanger& row_block_changer) + : SchemaChange(), _row_block_changer(row_block_changer) {} ~LinkedSchemaChange() {} virtual OLAPStatus process(RowsetReaderSharedPtr rowset_reader, RowsetWriter* new_rowset_writer, @@ -137,7 +136,7 @@ class SchemaChangeDirectly : public SchemaChange { public: // @params tablet the instance of tablet which has new schema. 
// @params row_block_changer changer to modify the data of RowBlock - explicit SchemaChangeDirectly(const RowBlockChanger& row_block_changer, std::shared_ptr mem_tracker); + explicit SchemaChangeDirectly(const RowBlockChanger& row_block_changer); virtual ~SchemaChangeDirectly(); virtual OLAPStatus process(RowsetReaderSharedPtr rowset_reader, RowsetWriter* new_rowset_writer, @@ -156,7 +155,7 @@ class SchemaChangeDirectly : public SchemaChange { // @breif schema change with sorting class SchemaChangeWithSorting : public SchemaChange { public: - explicit SchemaChangeWithSorting(const RowBlockChanger& row_block_changer, std::shared_ptr mem_tracker, + explicit SchemaChangeWithSorting(const RowBlockChanger& row_block_changer, size_t memory_limitation); virtual ~SchemaChangeWithSorting(); @@ -237,6 +236,7 @@ class SchemaChangeHandler { static OLAPStatus _init_column_mapping(ColumnMapping* column_mapping, const TabletColumn& column_schema, const std::string& value); + private: SchemaChangeHandler(); virtual ~SchemaChangeHandler(); diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp index 0da658c4efbec2..2c3a95689b69bb 100644 --- a/be/src/olap/segment_loader.cpp +++ b/be/src/olap/segment_loader.cpp @@ -30,10 +30,9 @@ void SegmentLoader::create_global_instance(size_t capacity) { _s_instance = &instance; } -SegmentLoader::SegmentLoader(size_t capacity) - : _mem_tracker(MemTracker::create_tracker(capacity, "SegmentLoader", nullptr, MemTrackerLevel::OVERVIEW)) { +SegmentLoader::SegmentLoader(size_t capacity) { _cache = std::unique_ptr( - new_typed_lru_cache("SegmentCache", capacity, LRUCacheType::NUMBER, _mem_tracker)); + new_typed_lru_cache("SegmentLoader:SegmentCache", capacity, LRUCacheType::NUMBER)); } bool SegmentLoader::_lookup(const SegmentLoader::CacheKey& key, SegmentCacheHandle* handle) { diff --git a/be/src/olap/segment_loader.h b/be/src/olap/segment_loader.h index 2a75efa544c2e2..30cfce304d5d28 100644 --- a/be/src/olap/segment_loader.h +++ 
b/be/src/olap/segment_loader.h @@ -25,7 +25,6 @@ #include "olap/lru_cache.h" #include "olap/olap_common.h" // for rowset id #include "olap/rowset/beta_rowset.h" -#include "runtime/mem_tracker.h" #include "util/time.h" namespace doris { @@ -107,7 +106,6 @@ class SegmentLoader { static SegmentLoader* _s_instance; // A LRU cache to cache all opened segments std::unique_ptr _cache = nullptr; - std::shared_ptr _mem_tracker = nullptr; }; // A handle for a single rowset from segment lru cache. diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp index 80816d71c7be2e..4baa4a47910004 100644 --- a/be/src/olap/snapshot_manager.cpp +++ b/be/src/olap/snapshot_manager.cpp @@ -34,6 +34,7 @@ #include "olap/rowset/rowset_converter.h" #include "olap/rowset/rowset_factory.h" #include "olap/rowset/rowset_id_generator.h" +#include "runtime/thread_context.h" #include "olap/rowset/rowset_writer.h" #include "olap/storage_engine.h" @@ -63,6 +64,7 @@ SnapshotManager* SnapshotManager::instance() { OLAPStatus SnapshotManager::make_snapshot(const TSnapshotRequest& request, string* snapshot_path, bool* allow_incremental_clone) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); OLAPStatus res = OLAP_SUCCESS; if (snapshot_path == nullptr) { LOG(WARNING) << "output parameter cannot be null"; @@ -92,6 +94,7 @@ OLAPStatus SnapshotManager::make_snapshot(const TSnapshotRequest& request, strin OLAPStatus SnapshotManager::release_snapshot(const string& snapshot_path) { // 如果请求的snapshot_path位于root/snapshot文件夹下,则认为是合法的,可以删除 // 否则认为是非法请求,返回错误结果 + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); auto stores = StorageEngine::instance()->get_stores(); for (auto store : stores) { if (store->is_remote()) { @@ -120,6 +123,7 @@ OLAPStatus SnapshotManager::release_snapshot(const string& snapshot_path) { // AlphaRowsetMeta here. 
OLAPStatus SnapshotManager::convert_rowset_ids(const FilePathDesc& clone_dir_desc, int64_t tablet_id, const int32_t& schema_hash) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); OLAPStatus res = OLAP_SUCCESS; // check clone dir existed if (!FileUtils::check_exist(clone_dir_desc.filepath)) { diff --git a/be/src/olap/snapshot_manager.h b/be/src/olap/snapshot_manager.h index 0efa64711befff..c13a133ab6a549 100644 --- a/be/src/olap/snapshot_manager.h +++ b/be/src/olap/snapshot_manager.h @@ -65,7 +65,10 @@ class SnapshotManager { const int32_t& schema_hash); private: - SnapshotManager() : _snapshot_base_id(0) {} + SnapshotManager() : _snapshot_base_id(0) { + _mem_tracker = MemTracker::create_tracker(-1, "SnapshotManager", nullptr, + MemTrackerLevel::OVERVIEW); + } OLAPStatus _calc_snapshot_id_path(const TabletSharedPtr& tablet, int64_t timeout_s, std::string* out_path); @@ -99,6 +102,8 @@ class SnapshotManager { // snapshot Mutex _snapshot_mutex; uint64_t _snapshot_base_id; + + std::shared_ptr _mem_tracker = nullptr; }; // SnapshotManager } // namespace doris diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index f1139e6ebc92b9..5c5c65e24fd4f4 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -112,10 +112,18 @@ StorageEngine::StorageEngine(const EngineOptions& options) _is_all_cluster_id_exist(true), _index_stream_lru_cache(nullptr), _file_cache(nullptr), - _compaction_mem_tracker(MemTracker::create_tracker(-1, "AutoCompaction", nullptr, - MemTrackerLevel::OVERVIEW)), - _tablet_mem_tracker(MemTracker::create_tracker(-1, "TabletHeader", nullptr, - MemTrackerLevel::OVERVIEW)), + _compaction_mem_tracker(MemTracker::create_tracker(-1, "StorageEngine::AutoCompaction", + nullptr, MemTrackerLevel::OVERVIEW)), + _tablet_mem_tracker(MemTracker::create_virtual_tracker( + -1, "StorageEngine::TabletHeader", nullptr, MemTrackerLevel::OVERVIEW)), + _schema_change_mem_tracker(MemTracker::create_tracker( + -1, 
"StorageEngine::SchemaChange", nullptr, MemTrackerLevel::OVERVIEW)), + _clone_mem_tracker(MemTracker::create_tracker(-1, "StorageEngine::Clone", nullptr, + MemTrackerLevel::OVERVIEW)), + _batch_load_mem_tracker(MemTracker::create_tracker(-1, "StorageEngine::BatchLoad", + nullptr, MemTrackerLevel::OVERVIEW)), + _consistency_mem_tracker(MemTracker::create_tracker(-1, "StorageEngine::Consistency", + nullptr, MemTrackerLevel::OVERVIEW)), _stop_background_threads_latch(1), _tablet_manager(new TabletManager(config::tablet_map_shard_size)), _txn_manager(new TxnManager(config::txn_map_shard_size, config::txn_shard_size)), @@ -1063,17 +1071,12 @@ bool StorageEngine::check_rowset_id_in_unused_rowsets(const RowsetId& rowset_id) void StorageEngine::create_cumulative_compaction( TabletSharedPtr best_tablet, std::shared_ptr& cumulative_compaction) { - std::string tracker_label = - "StorageEngine:CumulativeCompaction:" + std::to_string(best_tablet->tablet_id()); - cumulative_compaction.reset( - new CumulativeCompaction(best_tablet, tracker_label, _compaction_mem_tracker)); + cumulative_compaction.reset(new CumulativeCompaction(best_tablet)); } void StorageEngine::create_base_compaction(TabletSharedPtr best_tablet, std::shared_ptr& base_compaction) { - std::string tracker_label = - "StorageEngine:BaseCompaction:" + std::to_string(best_tablet->tablet_id()); - base_compaction.reset(new BaseCompaction(best_tablet, tracker_label, _compaction_mem_tracker)); + base_compaction.reset(new BaseCompaction(best_tablet)); } // Return json: diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index a07d3a98bed18e..83ac78d13c5fee 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -185,8 +185,12 @@ class StorageEngine { Status get_compaction_status_json(std::string* result); + std::shared_ptr compaction_mem_tracker() { return _compaction_mem_tracker; } std::shared_ptr tablet_mem_tracker() { return _tablet_mem_tracker; } std::shared_ptr 
schema_change_mem_tracker() { return _schema_change_mem_tracker; } + std::shared_ptr clone_mem_tracker() { return _clone_mem_tracker; } + std::shared_ptr batch_load_mem_tracker() { return _batch_load_mem_tracker; } + std::shared_ptr consistency_mem_tracker() { return _consistency_mem_tracker; } // check cumulative compaction config void check_cumulative_compaction_config(); @@ -326,6 +330,9 @@ class StorageEngine { std::shared_ptr _compaction_mem_tracker; std::shared_ptr _tablet_mem_tracker; std::shared_ptr _schema_change_mem_tracker; + std::shared_ptr _clone_mem_tracker; + std::shared_ptr _batch_load_mem_tracker; + std::shared_ptr _consistency_mem_tracker; CountDownLatch _stop_background_threads_latch; scoped_refptr _unused_rowset_monitor_thread; diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index b96290db74c2fb..910c23bbdb6503 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -639,17 +639,15 @@ OLAPStatus Tablet::_capture_consistent_rowsets_unlocked( } OLAPStatus Tablet::capture_rs_readers(const Version& spec_version, - std::vector* rs_readers, - std::shared_ptr parent_tracker) const { + std::vector* rs_readers) const { std::vector version_path; RETURN_NOT_OK(capture_consistent_versions(spec_version, &version_path)); - RETURN_NOT_OK(capture_rs_readers(version_path, rs_readers, parent_tracker)); + RETURN_NOT_OK(capture_rs_readers(version_path, rs_readers)); return OLAP_SUCCESS; } OLAPStatus Tablet::capture_rs_readers(const std::vector& version_path, - std::vector* rs_readers, - std::shared_ptr parent_tracker) const { + std::vector* rs_readers) const { DCHECK(rs_readers != nullptr && rs_readers->empty()); for (auto version : version_path) { auto it = _rs_version_map.find(version); @@ -666,7 +664,7 @@ OLAPStatus Tablet::capture_rs_readers(const std::vector& version_path, } } RowsetReaderSharedPtr rs_reader; - auto res = it->second->create_reader(parent_tracker, &rs_reader); + auto res = it->second->create_reader(&rs_reader); if (res 
!= OLAP_SUCCESS) { LOG(WARNING) << "failed to create reader for rowset:" << it->second->rowset_id(); return OLAP_ERR_CAPTURE_ROWSET_READER_ERROR; diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 03a86f820a56ec..f8ab386acf96fd 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -124,12 +124,10 @@ class Tablet : public BaseTablet { OLAPStatus capture_consistent_rowsets(const Version& spec_version, std::vector* rowsets) const; OLAPStatus capture_rs_readers(const Version& spec_version, - std::vector* rs_readers, - std::shared_ptr parent_tracker = nullptr) const; + std::vector* rs_readers) const; OLAPStatus capture_rs_readers(const std::vector& version_path, - std::vector* rs_readers, - std::shared_ptr parent_tracker = nullptr) const; + std::vector* rs_readers) const; DelPredicateArray delete_predicates() { return _tablet_meta->delete_predicates(); } void add_delete_predicate(const DeletePredicatePB& delete_predicate, int64_t version); diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index 592bac395383a4..75a6a298fda9d6 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -49,6 +49,7 @@ #include "service/backend_options.h" #include "util/doris_metrics.h" #include "util/file_utils.h" +#include "runtime/thread_context.h" #include "util/histogram.h" #include "util/path_util.h" #include "util/pretty_printer.h" @@ -73,7 +74,8 @@ static bool _cmp_tablet_by_create_time(const TabletSharedPtr& a, const TabletSha } TabletManager::TabletManager(int32_t tablet_map_lock_shard_size) - : _mem_tracker(MemTracker::create_tracker(-1, "TabletMeta", nullptr, MemTrackerLevel::OVERVIEW)), + : _mem_tracker(MemTracker::create_virtual_tracker(-1, "TabletManager", nullptr, + MemTrackerLevel::OVERVIEW)), _tablets_shards_size(tablet_map_lock_shard_size), _tablets_shards_mask(tablet_map_lock_shard_size - 1), _last_update_stat_ms(0) { @@ -88,7 +90,6 @@ TabletManager::TabletManager(int32_t tablet_map_lock_shard_size) 
} TabletManager::~TabletManager() { - _mem_tracker->release(_mem_tracker->consumption()); DEREGISTER_HOOK_METRIC(tablet_meta_mem_consumption); } diff --git a/be/src/olap/task/engine_alter_tablet_task.cpp b/be/src/olap/task/engine_alter_tablet_task.cpp index 51f029648a20d5..32ee5852510de6 100644 --- a/be/src/olap/task/engine_alter_tablet_task.cpp +++ b/be/src/olap/task/engine_alter_tablet_task.cpp @@ -18,6 +18,7 @@ #include "olap/task/engine_alter_tablet_task.h" #include "olap/schema_change.h" +#include "runtime/thread_context.h" namespace doris { @@ -31,9 +32,16 @@ EngineAlterTabletTask::EngineAlterTabletTask(const TAlterTabletReqV2& request, i _signature(signature), _task_type(task_type), _error_msgs(error_msgs), - _process_name(process_name) {} + _process_name(process_name) { + _mem_tracker = MemTracker::create_tracker( + config::memory_limitation_per_thread_for_schema_change * 1024 * 1024 * 1024, + fmt::format("{}: {}-{}", process_name, std::to_string(_alter_tablet_req.base_tablet_id), + std::to_string(_alter_tablet_req.new_tablet_id)), + StorageEngine::instance()->schema_change_mem_tracker(), MemTrackerLevel::TASK); +} OLAPStatus EngineAlterTabletTask::execute() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DorisMetrics::instance()->create_rollup_requests_total->increment(1); auto schema_change_handler = SchemaChangeHandler::instance(); diff --git a/be/src/olap/task/engine_alter_tablet_task.h b/be/src/olap/task/engine_alter_tablet_task.h index 73dd5514ac9da8..d02b3ac286a293 100644 --- a/be/src/olap/task/engine_alter_tablet_task.h +++ b/be/src/olap/task/engine_alter_tablet_task.h @@ -43,6 +43,7 @@ class EngineAlterTabletTask : public EngineTask { vector* _error_msgs; const string& _process_name; + std::shared_ptr _mem_tracker; }; // EngineTask } // namespace doris diff --git a/be/src/olap/task/engine_batch_load_task.cpp b/be/src/olap/task/engine_batch_load_task.cpp index a2750155415795..d73256df5b647f 100644 --- 
a/be/src/olap/task/engine_batch_load_task.cpp +++ b/be/src/olap/task/engine_batch_load_task.cpp @@ -35,6 +35,7 @@ #include "olap/push_handler.h" #include "olap/storage_engine.h" #include "olap/tablet.h" +#include "runtime/thread_context.h" #include "util/doris_metrics.h" #include "util/pretty_printer.h" @@ -52,11 +53,15 @@ EngineBatchLoadTask::EngineBatchLoadTask(TPushReq& push_req, std::vectorbatch_load_mem_tracker(), MemTrackerLevel::TASK); } EngineBatchLoadTask::~EngineBatchLoadTask() {} OLAPStatus EngineBatchLoadTask::execute() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); AgentStatus status = DORIS_SUCCESS; if (_push_req.push_type == TPushType::LOAD || _push_req.push_type == TPushType::LOAD_DELETE || _push_req.push_type == TPushType::LOAD_V2) { diff --git a/be/src/olap/task/engine_batch_load_task.h b/be/src/olap/task/engine_batch_load_task.h index 125dc7fc149eb6..3e9d92c71a6891 100644 --- a/be/src/olap/task/engine_batch_load_task.h +++ b/be/src/olap/task/engine_batch_load_task.h @@ -77,6 +77,7 @@ class EngineBatchLoadTask : public EngineTask { AgentStatus* _res_status; std::string _remote_file_path; std::string _local_file_path; + std::shared_ptr _mem_tracker; }; // class Pusher } // namespace doris #endif // DORIS_BE_SRC_OLAP_TASK_ENGINE_BATCH_LOAD_TASK_H diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp index b795efb834ab27..be44f0ac65a4a2 100644 --- a/be/src/olap/task/engine_checksum_task.cpp +++ b/be/src/olap/task/engine_checksum_task.cpp @@ -17,8 +17,9 @@ #include "olap/task/engine_checksum_task.h" -#include "olap/tuple_reader.h" #include "olap/row.h" +#include "olap/tuple_reader.h" +#include "runtime/thread_context.h" namespace doris { @@ -27,9 +28,14 @@ EngineChecksumTask::EngineChecksumTask(TTabletId tablet_id, TSchemaHash schema_h : _tablet_id(tablet_id), _schema_hash(schema_hash), _version(version), - _checksum(checksum) {} + _checksum(checksum) { + _mem_tracker = 
MemTracker::create_tracker(-1, "compute checksum: " + std::to_string(tablet_id), + StorageEngine::instance()->consistency_mem_tracker(), + MemTrackerLevel::TASK); +} OLAPStatus EngineChecksumTask::execute() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); OLAPStatus res = _compute_checksum(); return res; } // execute @@ -87,8 +93,7 @@ OLAPStatus EngineChecksumTask::_compute_checksum() { } RowCursor row; - std::shared_ptr tracker(new MemTracker(-1)); - std::unique_ptr mem_pool(new MemPool(tracker.get())); + std::unique_ptr mem_pool(new MemPool("EngineChecksumTask:_compute_checksum")); std::unique_ptr agg_object_pool(new ObjectPool()); res = row.init(tablet->tablet_schema(), reader_params.return_columns); if (res != OLAP_SUCCESS) { diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index 71e73a3b642ab9..0ae6c7fb498367 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -30,6 +30,7 @@ #include "olap/rowset/rowset_factory.h" #include "olap/snapshot_manager.h" #include "runtime/client_cache.h" +#include "runtime/thread_context.h" #include "util/thrift_rpc_helper.h" using std::set; @@ -55,9 +56,14 @@ EngineCloneTask::EngineCloneTask(const TCloneReq& clone_req, const TMasterInfo& _tablet_infos(tablet_infos), _res_status(res_status), _signature(signature), - _master_info(master_info) {} + _master_info(master_info) { + _mem_tracker = MemTracker::create_tracker( + -1, "clone tablet: " + std::to_string(_clone_req.tablet_id), + StorageEngine::instance()->clone_mem_tracker(), MemTrackerLevel::TASK); +} OLAPStatus EngineCloneTask::execute() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // register the tablet to avoid it is deleted by gc thread during clone process StorageEngine::instance()->tablet_manager()->register_clone_tablet(_clone_req.tablet_id); OLAPStatus st = _do_clone(); @@ -758,9 +764,9 @@ OLAPStatus EngineCloneTask::_finish_full_clone(Tablet* 
tablet, TabletMeta* clone // but some rowset is useless, so that remove them here for (auto& rs_meta_ptr : rs_metas_found_in_src) { RowsetSharedPtr rowset_to_remove; - auto s = - RowsetFactory::create_rowset(&(cloned_tablet_meta->tablet_schema()), - tablet->tablet_path_desc().filepath, rs_meta_ptr, &rowset_to_remove); + auto s = RowsetFactory::create_rowset(&(cloned_tablet_meta->tablet_schema()), + tablet->tablet_path_desc().filepath, rs_meta_ptr, + &rowset_to_remove); if (s != OLAP_SUCCESS) { LOG(WARNING) << "failed to init rowset to remove: " << rs_meta_ptr->rowset_id().to_string(); diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index dbe3c1fef81b63..508e5e755dd0bd 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -76,6 +76,7 @@ class EngineCloneTask : public EngineTask { const TMasterInfo& _master_info; int64_t _copy_size; int64_t _copy_time_ms; + std::shared_ptr _mem_tracker; }; // EngineTask } // namespace doris diff --git a/be/src/olap/tuple_reader.cpp b/be/src/olap/tuple_reader.cpp index 5c15c2b42f9741..93ba2513d9866a 100644 --- a/be/src/olap/tuple_reader.cpp +++ b/be/src/olap/tuple_reader.cpp @@ -30,7 +30,6 @@ #include "olap/schema.h" #include "olap/storage_engine.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "util/date_func.h" using std::nothrow; diff --git a/be/src/runtime/buffered_block_mgr2.cc b/be/src/runtime/buffered_block_mgr2.cc index 5b70f763c6c9ab..09de2b4a1b2089 100644 --- a/be/src/runtime/buffered_block_mgr2.cc +++ b/be/src/runtime/buffered_block_mgr2.cc @@ -22,6 +22,7 @@ #include "runtime/mem_pool.h" #include "runtime/mem_tracker.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/tmp_file_mgr.h" #include "util/bit_util.h" #include "util/debug_util.h" @@ -219,11 +220,9 @@ BufferedBlockMgr2::BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_m _writes_issued(0), _state(state) {} 
-Status BufferedBlockMgr2::create(RuntimeState* state, const std::shared_ptr& parent, - RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr, - int64_t mem_limit, int64_t block_size, +Status BufferedBlockMgr2::create(RuntimeState* state, RuntimeProfile* profile, + TmpFileMgr* tmp_file_mgr, int64_t mem_limit, int64_t block_size, std::shared_ptr* block_mgr) { - DCHECK(parent != nullptr); block_mgr->reset(); { // we do not use global BlockMgrsMap for now, to avoid mem-exceeded different fragments @@ -245,7 +244,7 @@ Status BufferedBlockMgr2::create(RuntimeState* state, const std::shared_ptrquery_id()] = *block_mgr; } } - (*block_mgr)->init(state->exec_env()->disk_io_mgr(), profile, parent, mem_limit); + (*block_mgr)->init(state->exec_env()->disk_io_mgr(), profile, mem_limit); return Status::OK(); } @@ -290,6 +289,7 @@ void BufferedBlockMgr2::clear_reservations(Client* client) { bool BufferedBlockMgr2::try_acquire_tmp_reservation(Client* client, int num_buffers) { lock_guard lock(_lock); + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); // TODO: Can the modifications to the client's mem variables can be made w/o the lock? DCHECK_EQ(client->_num_tmp_reserved_buffers, 0); if (client->_num_pinned_buffers < client->_num_reserved_buffers) { @@ -309,6 +309,7 @@ bool BufferedBlockMgr2::try_acquire_tmp_reservation(Client* client, int num_buff } bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); // Later, we use this interface to manage the consumption of memory of hashtable instead of ReservationTracker. // So it is possible to allocate 0, which has no additional impact on the behavior of BufferedBlockMgr. 
// The process of memory allocation still by BufferPool, Because bufferpool has done a lot of optimization in memory allocation @@ -451,6 +452,7 @@ Status BufferedBlockMgr2::add_exec_msg(const std::string& msg) const { Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Block** block, int64_t len) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); DCHECK_LE(len, _max_block_size) << "Cannot request block bigger than max_len"; DCHECK_NE(len, 0) << "Cannot request block of zero size"; *block = nullptr; @@ -515,6 +517,7 @@ Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Bloc } Status BufferedBlockMgr2::transfer_buffer(Block* dst, Block* src, bool unpin) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); Status status = Status::OK(); DCHECK(dst != nullptr); DCHECK(src != nullptr); @@ -559,6 +562,7 @@ Status BufferedBlockMgr2::transfer_buffer(Block* dst, Block* src, bool unpin) { } BufferedBlockMgr2::~BufferedBlockMgr2() { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); { lock_guard lock(_s_block_mgrs_lock); BlockMgrsMap::iterator it = _s_query_to_block_mgrs.find(_query_id); @@ -636,6 +640,7 @@ Status BufferedBlockMgr2::delete_or_unpin_block(Block* block, bool unpin) { } Status BufferedBlockMgr2::pin_block(Block* block, bool* pinned, Block* release_block, bool unpin) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); DCHECK(block != nullptr); DCHECK(!block->_is_deleted); *pinned = false; @@ -716,6 +721,7 @@ Status BufferedBlockMgr2::pin_block(Block* block, bool* pinned, Block* release_b } Status BufferedBlockMgr2::unpin_block(Block* block) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); DCHECK(!block->_is_deleted) << "Unpin for deleted block."; lock_guard unpinned_lock(_lock); @@ -918,6 +924,7 @@ void BufferedBlockMgr2::write_complete(Block* block, const Status& write_status) } void BufferedBlockMgr2::delete_block(Block* block) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); DCHECK(!block->_is_deleted); lock_guard lock(_lock); @@ -1261,14 +1268,12 @@ 
string BufferedBlockMgr2::debug_internal() const { << " Total pinned buffers: " << _total_pinned_buffers << endl << " Unfullfilled reserved buffers: " << _unfullfilled_reserved_buffers << endl << " Remaining memory: " << _mem_tracker->spare_capacity() - << " (#blocks=" << (_mem_tracker->spare_capacity() / _max_block_size) << ")" - << endl + << " (#blocks=" << (_mem_tracker->spare_capacity() / _max_block_size) << ")" << endl << " Block write threshold: " << _block_write_threshold; return ss.str(); } -void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile, - const std::shared_ptr& parent_tracker, int64_t mem_limit) { +void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile, int64_t mem_limit) { unique_lock l(_lock); if (_initialized) { return; @@ -1293,7 +1298,7 @@ void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile, _integrity_check_timer = ADD_TIMER(_profile.get(), "TotalIntegrityCheckTime"); // Create a new mem_tracker and allocate buffers. - _mem_tracker = MemTracker::create_tracker(mem_limit, "BufferedBlockMgr2", parent_tracker); + _mem_tracker = MemTracker::create_tracker(mem_limit, "BufferedBlockMgr2"); _initialized = true; } diff --git a/be/src/runtime/buffered_block_mgr2.h b/be/src/runtime/buffered_block_mgr2.h index 493398e8d9d5e6..1aeddba9e75828 100644 --- a/be/src/runtime/buffered_block_mgr2.h +++ b/be/src/runtime/buffered_block_mgr2.h @@ -283,9 +283,9 @@ class BufferedBlockMgr2 { // same query id has already been created, that block mgr is returned. // - mem_limit: maximum memory that will be used by the block mgr. // - buffer_size: maximum size of each buffer. 
- static Status create(RuntimeState* state, const std::shared_ptr& parent, - RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr, int64_t mem_limit, - int64_t buffer_size, std::shared_ptr* block_mgr); + static Status create(RuntimeState* state, RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr, + int64_t mem_limit, int64_t buffer_size, + std::shared_ptr* block_mgr); ~BufferedBlockMgr2(); @@ -406,8 +406,7 @@ class BufferedBlockMgr2 { BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_mgr, int64_t block_size); // Initializes the block mgr. Idempotent and thread-safe. - void init(DiskIoMgr* io_mgr, RuntimeProfile* profile, - const std::shared_ptr& parent_tracker, int64_t mem_limit); + void init(DiskIoMgr* io_mgr, RuntimeProfile* profile, int64_t mem_limit); // Initializes _tmp_files. This is initialized the first time we need to write to disk. // Must be called with _lock taken. diff --git a/be/src/runtime/buffered_tuple_stream2.cc b/be/src/runtime/buffered_tuple_stream2.cc index d0c9428ffee0a7..6f55086e8befb5 100644 --- a/be/src/runtime/buffered_tuple_stream2.cc +++ b/be/src/runtime/buffered_tuple_stream2.cc @@ -484,7 +484,7 @@ Status BufferedTupleStream2::get_rows(unique_ptr* batch, bool* got_row return Status::OK(); } RETURN_IF_ERROR(prepare_for_read(false)); - batch->reset(new RowBatch(_desc, num_rows(), _block_mgr->get_tracker(_block_mgr_client).get())); + batch->reset(new RowBatch(_desc, num_rows())); bool eos = false; // Loop until get_next fills the entire batch. Each call can stop at block // boundaries. 
We generally want it to stop, so that blocks can be freed diff --git a/be/src/runtime/buffered_tuple_stream3.cc b/be/src/runtime/buffered_tuple_stream3.cc index e5bdb9ecd0e532..0da366a645a612 100644 --- a/be/src/runtime/buffered_tuple_stream3.cc +++ b/be/src/runtime/buffered_tuple_stream3.cc @@ -18,10 +18,8 @@ #include #include "runtime/buffered_tuple_stream3.inline.h" -#include "runtime/bufferpool/reservation_tracker.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" @@ -669,8 +667,7 @@ void BufferedTupleStream3::UnpinStream(UnpinMode mode) { CHECK_CONSISTENCY_FULL(); } */ -Status BufferedTupleStream3::GetRows(const std::shared_ptr& tracker, - std::unique_ptr* batch, bool* got_rows) { +Status BufferedTupleStream3::GetRows(std::unique_ptr* batch, bool* got_rows) { if (num_rows() > numeric_limits::max()) { // RowBatch::num_rows_ is a 32-bit int, avoid an overflow. return Status::InternalError( @@ -687,7 +684,7 @@ Status BufferedTupleStream3::GetRows(const std::shared_ptr& tracker, // TODO chenhao // capacity in RowBatch use int, but _num_rows is int64_t // it may be precision loss - batch->reset(new RowBatch(*desc_, num_rows(), tracker.get())); + batch->reset(new RowBatch(*desc_, num_rows())); bool eos = false; // Loop until GetNext fills the entire batch. Each call can stop at page // boundaries. We generally want it to stop, so that pages can be freed diff --git a/be/src/runtime/buffered_tuple_stream3.h b/be/src/runtime/buffered_tuple_stream3.h index 7d8f053d037085..b9d6c13e28801f 100644 --- a/be/src/runtime/buffered_tuple_stream3.h +++ b/be/src/runtime/buffered_tuple_stream3.h @@ -30,7 +30,6 @@ namespace doris { -class MemTracker; class RuntimeState; class RowDescriptor; class SlotDescriptor; @@ -333,8 +332,7 @@ class BufferedTupleStream3 { /// process. 
If the current unused reservation is not sufficient to pin the stream in /// memory, this will try to increase the reservation. If that fails, 'got_rows' is set /// to false. - Status GetRows(const std::shared_ptr& tracker, std::unique_ptr* batch, - bool* got_rows) WARN_UNUSED_RESULT; + Status GetRows(std::unique_ptr* batch, bool* got_rows) WARN_UNUSED_RESULT; /// Must be called once at the end to cleanup all resources. If 'batch' is non-nullptr, /// attaches buffers from pinned pages that rows returned from GetNext() may reference. diff --git a/be/src/runtime/bufferpool/buffer_pool.cc b/be/src/runtime/bufferpool/buffer_pool.cc index c0660bd77b135e..3ff0a2e10ef90a 100644 --- a/be/src/runtime/bufferpool/buffer_pool.cc +++ b/be/src/runtime/bufferpool/buffer_pool.cc @@ -378,8 +378,7 @@ BufferPool::Client::Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group buffers_allocated_bytes_(0) { // Set up a child profile with buffer pool info. RuntimeProfile* child_profile = profile->create_child("Buffer pool", true, true); - reservation_.InitChildTracker(child_profile, parent_reservation, mem_tracker.get(), - reservation_limit); + reservation_.InitChildTracker(child_profile, parent_reservation, nullptr, reservation_limit); counters_.alloc_time = ADD_TIMER(child_profile, "AllocTime"); counters_.cumulative_allocations = ADD_COUNTER(child_profile, "CumulativeAllocations", TUnit::UNIT); diff --git a/be/src/runtime/cache/result_cache.h b/be/src/runtime/cache/result_cache.h index 910cc191ee3948..7e4352ac7946d1 100644 --- a/be/src/runtime/cache/result_cache.h +++ b/be/src/runtime/cache/result_cache.h @@ -33,7 +33,6 @@ #include "runtime/cache/cache_utils.h" #include "runtime/cache/result_node.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "runtime/row_batch.h" #include "runtime/tuple_row.h" diff --git a/be/src/runtime/data_stream_mgr.cpp b/be/src/runtime/data_stream_mgr.cpp index fbe2af1ca74d5b..6363c61d4b3346 100644 --- 
a/be/src/runtime/data_stream_mgr.cpp +++ b/be/src/runtime/data_stream_mgr.cpp @@ -71,7 +71,7 @@ shared_ptr DataStreamMgr::create_recvr( VLOG_FILE << "creating receiver for fragment=" << fragment_instance_id << ", node=" << dest_node_id; shared_ptr recvr(new DataStreamRecvr( - this, state->instance_mem_tracker(), row_desc, fragment_instance_id, dest_node_id, + this, row_desc, fragment_instance_id, dest_node_id, num_senders, is_merging, buffer_size, profile, sub_plan_query_statistics_recvr)); uint32_t hash_value = get_hash_value(fragment_instance_id, dest_node_id); lock_guard l(_lock); diff --git a/be/src/runtime/data_stream_mgr.h b/be/src/runtime/data_stream_mgr.h index be370603c0e4dc..e627de17276d0f 100644 --- a/be/src/runtime/data_stream_mgr.h +++ b/be/src/runtime/data_stream_mgr.h @@ -30,7 +30,6 @@ #include "gen_cpp/Types_types.h" // for TUniqueId #include "gen_cpp/internal_service.pb.h" #include "runtime/descriptors.h" // for PlanNodeId -#include "runtime/mem_tracker.h" #include "runtime/query_statistics.h" #include "util/runtime_profile.h" diff --git a/be/src/runtime/data_stream_recvr.cc b/be/src/runtime/data_stream_recvr.cc index 9bf9bc26f03d33..f11b178fda30e0 100644 --- a/be/src/runtime/data_stream_recvr.cc +++ b/be/src/runtime/data_stream_recvr.cc @@ -28,6 +28,7 @@ #include "runtime/data_stream_mgr.h" #include "runtime/row_batch.h" #include "runtime/sorted_run_merger.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" #include "util/logging.h" #include "util/runtime_profile.h" @@ -184,6 +185,8 @@ Status DataStreamRecvr::SenderQueue::get_batch(RowBatch** next_batch) { if (!_pending_closures.empty()) { auto closure_pair = _pending_closures.front(); + // When the batch queue reaches the upper limit of memory, calling run to let + // brpc send data packets may cause additional memory to be released closure_pair.first->Run(); _pending_closures.pop_front(); @@ -248,7 +251,7 @@ void DataStreamRecvr::SenderQueue::add_batch(const PRowBatch& pb_batch, 
int be_n // Note: if this function makes a row batch, the batch *must* be added // to _batch_queue. It is not valid to create the row batch and destroy // it in this thread. - batch = new RowBatch(_recvr->row_desc(), pb_batch, _recvr->mem_tracker().get()); + batch = new RowBatch(_recvr->row_desc(), pb_batch); } VLOG_ROW << "added #rows=" << batch->num_rows() << " batch_size=" << batch_size << "\n"; @@ -270,8 +273,7 @@ void DataStreamRecvr::SenderQueue::add_batch(RowBatch* batch, bool use_move) { if (_is_cancelled) { return; } - RowBatch* nbatch = - new RowBatch(_recvr->row_desc(), batch->capacity(), _recvr->mem_tracker().get()); + RowBatch* nbatch = new RowBatch(_recvr->row_desc(), batch->capacity()); if (use_move) { nbatch->acquire_state(batch); } else { @@ -360,6 +362,7 @@ void DataStreamRecvr::SenderQueue::close() { Status DataStreamRecvr::create_merger(const TupleRowComparator& less_than) { DCHECK(_is_merging); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); vector child_input_batch_suppliers; // Create the merger that will a single stream of sorted rows. _merger.reset(new SortedRunMerger(less_than, &_row_desc, _profile, false)); @@ -373,8 +376,9 @@ Status DataStreamRecvr::create_merger(const TupleRowComparator& less_than) { } Status DataStreamRecvr::create_parallel_merger(const TupleRowComparator& less_than, - uint32_t batch_size, MemTracker* mem_tracker) { + uint32_t batch_size) { DCHECK(_is_merging); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); vector child_input_batch_suppliers; // Create the merger that will a single stream of sorted rows. @@ -398,8 +402,8 @@ Status DataStreamRecvr::create_parallel_merger(const TupleRowComparator& less_th auto step = _sender_queues.size() / parallel_thread + 1; for (int i = 0; i < _sender_queues.size(); i += step) { // Create the merger that will a single stream of sorted rows. 
- std::unique_ptr child_merger(new ChildSortedRunMerger( - less_than, &_row_desc, _profile, mem_tracker, batch_size, false)); + std::unique_ptr child_merger( + new ChildSortedRunMerger(less_than, &_row_desc, _profile, batch_size, false)); vector input_batch_suppliers; for (int j = i; j < std::min((size_t)i + step, _sender_queues.size()); ++j) { input_batch_suppliers.emplace_back(bind(mem_fn(&SenderQueue::get_batch), @@ -420,6 +424,7 @@ void DataStreamRecvr::transfer_all_resources(RowBatch* transfer_batch) { // _child_mergers is not empty, means use parallel merge need transfer resource from // _sender queue. // the need transfer resources from child_merger input_row_batch + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (!_child_mergers.empty()) { _merger->transfer_all_resources(transfer_batch); } else { @@ -432,10 +437,9 @@ void DataStreamRecvr::transfer_all_resources(RowBatch* transfer_batch) { } DataStreamRecvr::DataStreamRecvr( - DataStreamMgr* stream_mgr, const std::shared_ptr& parent_tracker, - const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id, - PlanNodeId dest_node_id, int num_senders, bool is_merging, int total_buffer_limit, - RuntimeProfile* profile, + DataStreamMgr* stream_mgr, const RowDescriptor& row_desc, + const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, int num_senders, + bool is_merging, int total_buffer_limit, RuntimeProfile* profile, std::shared_ptr sub_plan_query_statistics_recvr) : _mgr(stream_mgr), _fragment_instance_id(fragment_instance_id), @@ -446,7 +450,7 @@ DataStreamRecvr::DataStreamRecvr( _num_buffered_bytes(0), _profile(profile), _sub_plan_query_statistics_recvr(sub_plan_query_statistics_recvr) { - _mem_tracker = MemTracker::create_tracker(-1, "DataStreamRecvr", parent_tracker, + _mem_tracker = MemTracker::create_tracker(-1, "DataStreamRecvr", nullptr, MemTrackerLevel::VERBOSE, _profile); // Create one queue per sender if is_merging is true. 
@@ -469,22 +473,26 @@ DataStreamRecvr::DataStreamRecvr( Status DataStreamRecvr::get_next(RowBatch* output_batch, bool* eos) { DCHECK(_merger.get() != nullptr); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); return _merger->get_next(output_batch, eos); } void DataStreamRecvr::add_batch(const PRowBatch& batch, int sender_id, int be_number, int64_t packet_seq, ::google::protobuf::Closure** done) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? sender_id : 0; // Add all batches to the same queue if _is_merging is false. _sender_queues[use_sender_id]->add_batch(batch, be_number, packet_seq, done); } void DataStreamRecvr::add_batch(RowBatch* batch, int sender_id, bool use_move) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? sender_id : 0; _sender_queues[use_sender_id]->add_batch(batch, use_move); } void DataStreamRecvr::remove_sender(int sender_id, int be_number) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? 
sender_id : 0; _sender_queues[use_sender_id]->decrement_senders(be_number); } @@ -496,6 +504,7 @@ void DataStreamRecvr::cancel_stream() { } void DataStreamRecvr::close() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); for (int i = 0; i < _sender_queues.size(); ++i) { _sender_queues[i]->close(); } @@ -504,8 +513,6 @@ void DataStreamRecvr::close() { _mgr->deregister_recvr(fragment_instance_id(), dest_node_id()); _mgr = nullptr; _merger.reset(); - // TODO: Maybe shared tracker doesn't need to be reset manually - _mem_tracker.reset(); } DataStreamRecvr::~DataStreamRecvr() { @@ -515,6 +522,7 @@ DataStreamRecvr::~DataStreamRecvr() { Status DataStreamRecvr::get_batch(RowBatch** next_batch) { DCHECK(!_is_merging); DCHECK_EQ(_sender_queues.size(), 1); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); return _sender_queues[0]->get_batch(next_batch); } diff --git a/be/src/runtime/data_stream_recvr.h b/be/src/runtime/data_stream_recvr.h index 3e4806c6c28ca2..9bc084d6462b6a 100644 --- a/be/src/runtime/data_stream_recvr.h +++ b/be/src/runtime/data_stream_recvr.h @@ -88,8 +88,7 @@ class DataStreamRecvr { // queues. The exprs used in less_than must have already been prepared and opened. Status create_merger(const TupleRowComparator& less_than); - Status create_parallel_merger(const TupleRowComparator& less_than, uint32_t batch_size, - MemTracker* mem_tracker); + Status create_parallel_merger(const TupleRowComparator& less_than, uint32_t batch_size); // Fill output_batch with the next batch of rows obtained by merging the per-sender // input streams. Must only be called if _is_merging is true. 
Status get_next(RowBatch* output_batch, bool* eos); @@ -101,7 +100,6 @@ class DataStreamRecvr { const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } PlanNodeId dest_node_id() const { return _dest_node_id; } const RowDescriptor& row_desc() const { return _row_desc; } - std::shared_ptr mem_tracker() const { return _mem_tracker; } void add_sub_plan_statistics(const PQueryStatistics& statistics, int sender_id) { _sub_plan_query_statistics_recvr->insert(statistics, sender_id); @@ -115,10 +113,9 @@ class DataStreamRecvr { friend class DataStreamMgr; class SenderQueue; - DataStreamRecvr(DataStreamMgr* stream_mgr, const std::shared_ptr& parent_tracker, - const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id, - PlanNodeId dest_node_id, int num_senders, bool is_merging, - int total_buffer_limit, RuntimeProfile* profile, + DataStreamRecvr(DataStreamMgr* stream_mgr, const RowDescriptor& row_desc, + const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, int num_senders, + bool is_merging, int total_buffer_limit, RuntimeProfile* profile, std::shared_ptr sub_plan_query_statistics_recvr); // If receive queue is full, done is enqueue pending, and return with *done is nullptr diff --git a/be/src/runtime/data_stream_sender.cpp b/be/src/runtime/data_stream_sender.cpp index 0fdb68f6d55f3b..4b311533ecf65c 100644 --- a/be/src/runtime/data_stream_sender.cpp +++ b/be/src/runtime/data_stream_sender.cpp @@ -39,6 +39,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "service/backend_options.h" #include "service/brpc.h" @@ -89,7 +90,7 @@ Status DataStreamSender::Channel::init(RuntimeState* state) { // TODO: figure out how to size _batch int capacity = std::max(1, _buffer_size / std::max(_row_desc.get_row_size(), 1)); - _batch.reset(new RowBatch(_row_desc, capacity, _parent->_mem_tracker.get())); + _batch.reset(new 
RowBatch(_row_desc, capacity)); if (_brpc_dest_addr.hostname.empty()) { LOG(WARNING) << "there is no brpc destination address's hostname" @@ -391,6 +392,7 @@ Status DataStreamSender::prepare(RuntimeState* state) { _mem_tracker = MemTracker::create_tracker( -1, "DataStreamSender:" + print_id(state->fragment_instance_id()), state->instance_mem_tracker(), MemTrackerLevel::VERBOSE, _profile); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_part_type == TPartitionType::UNPARTITIONED || _part_type == TPartitionType::RANDOM) { std::random_device rd; @@ -430,6 +432,7 @@ DataStreamSender::~DataStreamSender() { } Status DataStreamSender::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(state != nullptr); RETURN_IF_ERROR(Expr::open(_partition_expr_ctxs, state)); for (auto iter : _partition_infos) { @@ -439,6 +442,7 @@ Status DataStreamSender::open(RuntimeState* state) { } Status DataStreamSender::send(RuntimeState* state, RowBatch* batch) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); // Unpartition or _channel size @@ -642,6 +646,7 @@ Status DataStreamSender::close(RuntimeState* state, Status exec_status) { // make all channels close parallel if (_closed) return Status::OK(); _closed = true; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); Status final_st = Status::OK(); for (int i = 0; i < _channels.size(); ++i) { Status st = _channels[i]->close(state); diff --git a/be/src/runtime/disk_io_mgr.cc b/be/src/runtime/disk_io_mgr.cc index 0adf10be22a57c..585d88d62660da 100644 --- a/be/src/runtime/disk_io_mgr.cc +++ b/be/src/runtime/disk_io_mgr.cc @@ -20,6 +20,8 @@ #include #include "runtime/disk_io_mgr_internal.h" +#include "runtime/exec_env.h" +#include "runtime/thread_context.h" using std::string; using std::stringstream; @@ -198,7 +200,10 @@ string DiskIoMgr::debug_string() { } DiskIoMgr::BufferDescriptor::BufferDescriptor(DiskIoMgr* 
io_mgr) - : _io_mgr(io_mgr), _reader(nullptr), _buffer(nullptr) {} + : _io_mgr(io_mgr), + _reader(nullptr), + _buffer(nullptr), + _mem_tracker(io_mgr->cached_buffers_mem_tracker()) {} void DiskIoMgr::BufferDescriptor::reset(RequestContext* reader, ScanRange* range, char* buffer, int64_t buffer_len) { @@ -229,14 +234,8 @@ void DiskIoMgr::BufferDescriptor::set_mem_tracker(std::shared_ptr tr if (_mem_tracker.get() == tracker.get()) { return; } - // TODO(yingchun): use transfer_to? - if (_mem_tracker != nullptr) { - _mem_tracker->release(_buffer_len); - } + Status st = _mem_tracker->transfer_to(tracker, _buffer_len); _mem_tracker = std::move(tracker); - if (_mem_tracker != nullptr) { - _mem_tracker->consume(_buffer_len); - } } DiskIoMgr::WriteRange::WriteRange(const string& file, int64_t file_offset, int disk_id, @@ -360,12 +359,15 @@ DiskIoMgr::~DiskIoMgr() { } Status DiskIoMgr::init(const int64_t mem_limit) { - _disk_io_mem_tracker = MemTracker::create_tracker(mem_limit, "DiskIO", nullptr, MemTrackerLevel::OVERVIEW); + _mem_tracker = + MemTracker::create_tracker(mem_limit, "DiskIO", nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + _cached_buffers_mem_tracker = MemTracker::create_tracker( + mem_limit, "DiskIO:CachedBuffers", _mem_tracker, MemTrackerLevel::OVERVIEW); // If we hit the process limit, see if we can reclaim some memory by removing // previously allocated (but unused) io buffers. - /* - * process_mem_tracker->add_gc_function(bind(&DiskIoMgr::gc_io_buffers, this)); - */ + ExecEnv::GetInstance()->process_mem_tracker()->add_gc_function( + std::bind(&DiskIoMgr::gc_io_buffers, this, std::placeholders::_1)); for (int i = 0; i < _disk_queues.size(); ++i) { _disk_queues[i] = new DiskQueue(i); @@ -445,6 +447,7 @@ void DiskIoMgr::unregister_context(RequestContext* reader) { // is on. // If wait_for_disks_completion is true, wait for the number of active disks to become 0. 
void DiskIoMgr::cancel_context(RequestContext* context, bool wait_for_disks_completion) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); context->cancel(Status::Cancelled("Cancelled")); if (wait_for_disks_completion) { @@ -522,6 +525,7 @@ Status DiskIoMgr::validate_scan_range(ScanRange* range) { Status DiskIoMgr::add_scan_ranges(RequestContext* reader, const vector& ranges, bool schedule_immediately) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (ranges.empty()) { return Status::OK(); } @@ -571,6 +575,7 @@ Status DiskIoMgr::add_scan_ranges(RequestContext* reader, const vectorconsume(*buffer_size); + _cached_buffers_mem_tracker->consume(*buffer_size); + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); buffer = new char[*buffer_size]; } else { buffer = _free_buffers[idx].front(); @@ -724,28 +731,34 @@ char* DiskIoMgr::get_free_buffer(int64_t* buffer_size) { return buffer; } -void DiskIoMgr::gc_io_buffers() { +void DiskIoMgr::gc_io_buffers(int64_t bytes_to_free) { unique_lock lock(_free_buffers_lock); int buffers_freed = 0; int bytes_freed = 0; for (int idx = 0; idx < _free_buffers.size(); ++idx) { - for (list::iterator iter = _free_buffers[idx].begin(); - iter != _free_buffers[idx].end(); ++iter) { - int64_t buffer_size = (1 << idx) * _min_buffer_size; - _disk_io_mem_tracker->release(buffer_size); - --_num_allocated_buffers; - delete[] * iter; - - ++buffers_freed; - bytes_freed += buffer_size; + { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + for (list::iterator iter = _free_buffers[idx].begin(); + iter != _free_buffers[idx].end(); ++iter) { + int64_t buffer_size = (1 << idx) * _min_buffer_size; + _cached_buffers_mem_tracker->release(buffer_size); + --_num_allocated_buffers; + delete[] * iter; + + ++buffers_freed; + bytes_freed += buffer_size; + } } _free_buffers[idx].clear(); + if (bytes_freed >= bytes_to_free) { + break; + } } } void DiskIoMgr::return_free_buffer(BufferDescriptor* desc) { return_free_buffer(desc->_buffer, desc->_buffer_len); 
- desc->set_mem_tracker(nullptr); + desc->set_mem_tracker(_cached_buffers_mem_tracker); desc->_buffer = nullptr; } @@ -759,8 +772,9 @@ void DiskIoMgr::return_free_buffer(char* buffer, int64_t buffer_size) { if (!config::disable_mem_pools && _free_buffers[idx].size() < config::max_free_io_buffers) { _free_buffers[idx].push_back(buffer); } else { - _disk_io_mem_tracker->release(buffer_size); + _cached_buffers_mem_tracker->release(buffer_size); --_num_allocated_buffers; + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); delete[] buffer; } } @@ -816,15 +830,9 @@ bool DiskIoMgr::get_next_request_range(DiskQueue* disk_queue, RequestRange** ran // We just picked a reader, check the mem limits. // TODO: we can do a lot better here. The reader can likely make progress // with fewer io buffers. - bool disk_io_limit_exceeded = _disk_io_mem_tracker->limit_exceeded(); - bool reader_limit_exceeded = - (*request_context)->_mem_tracker != nullptr - ? (*request_context)->_mem_tracker->any_limit_exceeded() - : false; - // bool reader_limit_exceeded = (*request_context)->_mem_tracker != nullptr - // ? (*request_context)->_mem_tracker->limit_exceeded() : false; - - if (disk_io_limit_exceeded || reader_limit_exceeded) { + if ((*request_context)->_mem_tracker != nullptr + ? (*request_context)->_mem_tracker->any_limit_exceeded() + : false) { (*request_context)->cancel(Status::MemoryLimitExceeded("Memory limit exceeded")); } @@ -1016,17 +1024,8 @@ void DiskIoMgr::read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRa int64_t bytes_remaining = range->_len - range->_bytes_read; DCHECK_GT(bytes_remaining, 0); int64_t buffer_size = std::min(bytes_remaining, static_cast(_max_buffer_size)); - bool enough_memory = true; - if (reader->_mem_tracker != nullptr) { - enough_memory = reader->_mem_tracker->spare_capacity() > LOW_MEMORY; - if (!enough_memory) { - // Low memory, GC and try again. 
- gc_io_buffers(); - enough_memory = reader->_mem_tracker->spare_capacity() > LOW_MEMORY; - } - } - if (!enough_memory) { + if (reader->_mem_tracker != nullptr && reader->_mem_tracker->spare_capacity() <= LOW_MEMORY) { RequestContext::PerDiskState& state = reader->_disk_states[disk_queue->disk_id]; unique_lock reader_lock(reader->_lock); @@ -1150,6 +1149,7 @@ int DiskIoMgr::free_buffers_idx(int64_t buffer_size) { } Status DiskIoMgr::add_write_range(RequestContext* writer, WriteRange* write_range) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK_LE(write_range->len(), _max_buffer_size); unique_lock writer_lock(writer->_lock); diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h index 0679b8a1d72a28..fc386896f1a1c1 100644 --- a/be/src/runtime/disk_io_mgr.h +++ b/be/src/runtime/disk_io_mgr.h @@ -262,9 +262,6 @@ class DiskIoMgr { // Reader that this buffer is for RequestContext* _reader; - // The current tracker this buffer is associated with. - std::shared_ptr _mem_tracker; - // Scan range that this buffer is for. ScanRange* _scan_range; @@ -284,6 +281,9 @@ class DiskIoMgr { Status _status; int64_t _scan_range_offset; + + // The current tracker this buffer is associated with. + std::shared_ptr _mem_tracker; }; // The request type, read or write associated with a request range. @@ -657,6 +657,10 @@ class DiskIoMgr { // Returns the number of buffers currently owned by all readers. int num_buffers_in_readers() const { return _num_buffers_in_readers; } + std::shared_ptr cached_buffers_mem_tracker() const { + return _cached_buffers_mem_tracker; + } + // Dumps the disk IoMgr queues (for readers and disks) std::string debug_string(); @@ -691,8 +695,9 @@ class DiskIoMgr { // Pool to allocate BufferDescriptors. ObjectPool _pool; + std::shared_ptr _mem_tracker; // account for io buffers. - std::shared_ptr _disk_io_mem_tracker; + std::shared_ptr _cached_buffers_mem_tracker; // Number of worker(read) threads per disk. 
Also the max depth of queued // work to the disk. @@ -787,10 +792,9 @@ class DiskIoMgr { char* get_free_buffer(int64_t* buffer_size); // Garbage collect all unused io buffers. This is currently only triggered when the - // process wide limit is hit. This is not good enough. While it is sufficient for - // the IoMgr, other components do not trigger this GC. + // process wide limit is hit. // TODO: make this run periodically? - void gc_io_buffers(); + void gc_io_buffers(int64_t bytes_to_free = INT_MAX); // Returns a buffer to the free list. buffer_size / _min_buffer_size should be a power // of 2, and buffer_size should be <= _max_buffer_size. These constraints will be met diff --git a/be/src/runtime/dpp_sink.cpp b/be/src/runtime/dpp_sink.cpp index 1321a6a1b91876..6605c75269f919 100644 --- a/be/src/runtime/dpp_sink.cpp +++ b/be/src/runtime/dpp_sink.cpp @@ -557,8 +557,7 @@ Status Translator::prepare(RuntimeState* state) { RETURN_IF_ERROR(create_writer(state)); // 4. new batch for writer - _batch_to_write.reset( - new RowBatch(_row_desc, state->batch_size(), state->instance_mem_tracker().get())); + _batch_to_write.reset(new RowBatch(_row_desc, state->batch_size())); if (_batch_to_write.get() == nullptr) { return Status::InternalError("No memory to allocate RowBatch."); } @@ -795,7 +794,7 @@ Status Translator::process(RuntimeState* state) { SCOPED_TIMER(_agg_timer); bool eos = false; while (!eos) { - RowBatch batch(_row_desc, state->batch_size(), state->instance_mem_tracker().get()); + RowBatch batch(_row_desc, state->batch_size()); RETURN_IF_ERROR(_sorter->get_next(&batch, &eos)); diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index 932c8fb526ad1a..8fa0bf781d375d 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -119,8 +119,8 @@ class ExecEnv { } std::shared_ptr process_mem_tracker() { return _process_mem_tracker; } - std::shared_ptr new_process_mem_tracker() { return _new_process_mem_tracker; } std::shared_ptr 
query_pool_mem_tracker() { return _query_pool_mem_tracker; } + std::shared_ptr load_pool_mem_tracker() { return _load_pool_mem_tracker; } MemTrackerTaskPool* task_pool_mem_tracker_registry() { return _task_pool_mem_tracker_registry.get(); } @@ -160,9 +160,6 @@ class ExecEnv { RoutineLoadTaskExecutor* routine_load_task_executor() { return _routine_load_task_executor; } HeartbeatFlags* heartbeat_flags() { return _heartbeat_flags; } - // The root tracker should be set before calling ExecEnv::init(); - void set_root_mem_tracker(std::shared_ptr root_tracker); - private: Status _init(const std::vector& store_paths); void _destroy(); @@ -192,10 +189,10 @@ class ExecEnv { // The ancestor of all trackers in the process. It is the only child of the root tracker. // All manually created trackers should specify the process tracker as the parent. std::shared_ptr _process_mem_tracker = nullptr; - // TODO(zxy): Will replace _process_mem_tracker in future. - std::shared_ptr _new_process_mem_tracker = nullptr; // The ancestor for all querys tracker. std::shared_ptr _query_pool_mem_tracker = nullptr; + // The ancestor for all load tracker. + std::shared_ptr _load_pool_mem_tracker = nullptr; std::unique_ptr _task_pool_mem_tracker_registry; // The following two thread pools are used in different scenarios. 
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 89d67f1eb19d14..c7248803c22d67 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -73,6 +73,8 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(send_batch_thread_pool_thread_num, MetricUnit DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(send_batch_thread_pool_queue_size, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(query_mem_consumption, MetricUnit::BYTES, "", mem_consumption, Labels({{"type", "query"}})); +DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(load_mem_consumption, MetricUnit::BYTES, "", mem_consumption, + Labels({{"type", "load"}})); Status ExecEnv::init(ExecEnv* env, const std::vector& store_paths) { return env->_init(store_paths); @@ -174,14 +176,18 @@ Status ExecEnv::_init_mem_tracker() { << ". Using physical memory instead"; global_memory_limit_bytes = MemInfo::physical_mem(); } - _process_mem_tracker = MemTracker::create_tracker(global_memory_limit_bytes, "Process", - MemTracker::get_root_tracker(), MemTrackerLevel::OVERVIEW); - _new_process_mem_tracker = MemTracker::create_tracker(global_memory_limit_bytes, "NewProcess", - MemTracker::get_root_tracker(), MemTrackerLevel::OVERVIEW); - _query_pool_mem_tracker = MemTracker::create_tracker(global_memory_limit_bytes, "QueryPool", - _new_process_mem_tracker, MemTrackerLevel::OVERVIEW); + _process_mem_tracker = + MemTracker::create_tracker(global_memory_limit_bytes, "Process", + MemTracker::get_root_tracker(), MemTrackerLevel::OVERVIEW); + _query_pool_mem_tracker = + MemTracker::create_tracker(global_memory_limit_bytes, "QueryPool", _process_mem_tracker, + MemTrackerLevel::OVERVIEW); REGISTER_HOOK_METRIC(query_mem_consumption, [this]() { return _query_pool_mem_tracker->consumption(); }); + _load_pool_mem_tracker = MemTracker::create_tracker( + global_memory_limit_bytes, "LoadPool", _process_mem_tracker, MemTrackerLevel::OVERVIEW); + REGISTER_HOOK_METRIC(load_mem_consumption, + [this]() { return 
_load_pool_mem_tracker->consumption(); }); LOG(INFO) << "Using global memory limit: " << PrettyPrinter::print(global_memory_limit_bytes, TUnit::BYTES) << ", origin config value: " << config::mem_limit; @@ -318,6 +324,7 @@ void ExecEnv::_destroy() { SAFE_DELETE(_heartbeat_flags); DEREGISTER_HOOK_METRIC(query_mem_consumption); + DEREGISTER_HOOK_METRIC(load_mem_consumption); _is_init = false; } diff --git a/be/src/runtime/export_sink.cpp b/be/src/runtime/export_sink.cpp index b1dbcdb5bc20ad..37cb719117e60c 100644 --- a/be/src/runtime/export_sink.cpp +++ b/be/src/runtime/export_sink.cpp @@ -28,7 +28,6 @@ #include "exprs/expr.h" #include "exprs/expr_context.h" #include "gutil/strings/numbers.h" -#include "runtime/mem_tracker.h" #include "runtime/mysql_table_sink.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" @@ -72,10 +71,8 @@ Status ExportSink::prepare(RuntimeState* state) { _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); SCOPED_TIMER(_profile->total_time_counter()); - _mem_tracker = MemTracker::create_tracker(-1, "ExportSink", state->instance_mem_tracker()); - // Prepare the exprs to run. 
- RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker)); + RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _expr_mem_tracker)); // TODO(lingbin): add some Counter _bytes_written_counter = ADD_COUNTER(profile(), "BytesExported", TUnit::BYTES); diff --git a/be/src/runtime/export_sink.h b/be/src/runtime/export_sink.h index c6bb7fe7cd3f59..7f46136c2d7a05 100644 --- a/be/src/runtime/export_sink.h +++ b/be/src/runtime/export_sink.h @@ -31,7 +31,6 @@ class TExpr; class RuntimeState; class RuntimeProfile; class ExprContext; -class MemTracker; class FileWriter; class TupleRow; @@ -75,8 +74,6 @@ class ExportSink : public DataSink { RuntimeProfile* _profile; - std::shared_ptr _mem_tracker; - RuntimeProfile::Counter* _bytes_written_counter; RuntimeProfile::Counter* _rows_written_counter; RuntimeProfile::Counter* _write_timer; diff --git a/be/src/runtime/fold_constant_executor.cpp b/be/src/runtime/fold_constant_executor.cpp index 2ecc6ecb6c620a..5c6d3ace55c1f3 100644 --- a/be/src/runtime/fold_constant_executor.cpp +++ b/be/src/runtime/fold_constant_executor.cpp @@ -24,6 +24,7 @@ #include "runtime/runtime_state.h" #include "runtime/mem_tracker.h" #include "exprs/expr_context.h" +#include "runtime/thread_context.h" #include "exprs/expr.h" #include "common/object_pool.h" #include "common/status.h" @@ -43,6 +44,7 @@ TUniqueId FoldConstantExecutor::_dummy_id; Status FoldConstantExecutor::fold_constant_expr( const TFoldConstantParams& params, PConstantExprResult* response) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const auto& expr_map = params.expr_map; auto expr_result_map = response->mutable_expr_result_map(); @@ -50,7 +52,6 @@ Status FoldConstantExecutor::fold_constant_expr( // init Status status = _init(query_globals); if (UNLIKELY(!status.ok())) { - LOG(WARNING) << "Failed to init mem trackers, msg: " << status.get_error_msg(); return status; } @@ -64,7 +65,6 @@ Status FoldConstantExecutor::fold_constant_expr( 
// prepare and open context status = _prepare_and_open(ctx); if (UNLIKELY(!status.ok())) { - LOG(WARNING) << "Failed to init mem trackers, msg: " << status.get_error_msg(); return status; } @@ -189,7 +189,7 @@ Status FoldConstantExecutor::_init(const TQueryGlobals& query_globals) { _runtime_profile = _runtime_state->runtime_profile(); _runtime_profile->set_name("FoldConstantExpr"); _mem_tracker = MemTracker::create_tracker(-1, "FoldConstantExpr", _runtime_state->instance_mem_tracker()); - _mem_pool.reset(new MemPool(_mem_tracker.get())); + _mem_pool.reset(new MemPool(_mem_tracker)); return Status::OK(); } diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 4884aaaaa222d4..660e61ae944b89 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -310,12 +310,12 @@ void FragmentExecState::coordinator_callback(const Status& status, RuntimeProfil RuntimeState* runtime_state = _executor.runtime_state(); DCHECK(runtime_state != nullptr); - if (runtime_state->query_options().query_type == TQueryType::LOAD && !done && status.ok()) { + if (runtime_state->query_type() == TQueryType::LOAD && !done && status.ok()) { // this is a load plan, and load is not finished, just make a brief report params.__set_loaded_rows(runtime_state->num_rows_load_total()); params.__set_loaded_bytes(runtime_state->num_bytes_load_total()); } else { - if (runtime_state->query_options().query_type == TQueryType::LOAD) { + if (runtime_state->query_type() == TQueryType::LOAD) { params.__set_loaded_rows(runtime_state->num_rows_load_total()); params.__set_loaded_bytes(runtime_state->num_bytes_load_total()); } @@ -470,10 +470,10 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi .query_id(exec_state->query_id()) .instance_id(exec_state->fragment_instance_id()) .tag("pthread_id", std::to_string((uintptr_t)pthread_self())); - SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(exec_state->query_id()), - 
exec_state->fragment_instance_id()); - // thread_local_ctx.attach(ThreadContext::QUERY, print_id(exec_state->query_id()), - // exec_state->fragment_instance_id()); + SCOPED_ATTACH_TASK_THREAD_4ARG(exec_state->executor()->runtime_state()->query_type(), + print_id(exec_state->query_id()), + exec_state->fragment_instance_id(), + exec_state->executor()->runtime_state()->instance_mem_tracker()); exec_state->execute(); std::shared_ptr fragments_ctx = exec_state->get_fragments_ctx(); @@ -494,7 +494,6 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi // Callback after remove from this id cb(exec_state->executor()); - // thread_local_ctx.detach(); } Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params) { diff --git a/be/src/runtime/free_pool.hpp b/be/src/runtime/free_pool.hpp index 379d2549aef696..6dde1648edf857 100644 --- a/be/src/runtime/free_pool.hpp +++ b/be/src/runtime/free_pool.hpp @@ -40,7 +40,7 @@ namespace doris { // contains the link to the next allocation. // This has O(1) Allocate() and Free(). // This is not thread safe. -// TODO: consider integrating this with MemPool. +// TODO(zxy): consider integrating this with MemPool. // TODO: consider changing to something more granular than doubling. 
class FreePool { public: diff --git a/be/src/runtime/initial_reservations.cc b/be/src/runtime/initial_reservations.cc index 0e622e85328a5b..21d0e66aaecc8a 100644 --- a/be/src/runtime/initial_reservations.cc +++ b/be/src/runtime/initial_reservations.cc @@ -40,8 +40,7 @@ InitialReservations::InitialReservations(ObjectPool* obj_pool, : initial_reservation_mem_tracker_( MemTracker::create_tracker(-1, "InitialReservations", query_mem_tracker)), remaining_initial_reservation_claims_(initial_reservation_total_claims) { - initial_reservations_.InitChildTracker(nullptr, query_reservation, - initial_reservation_mem_tracker_.get(), + initial_reservations_.InitChildTracker(nullptr, query_reservation, nullptr, numeric_limits::max()); } diff --git a/be/src/runtime/load_channel.cpp b/be/src/runtime/load_channel.cpp index 276425d0ab205b..3e961ecfde6f3a 100644 --- a/be/src/runtime/load_channel.cpp +++ b/be/src/runtime/load_channel.cpp @@ -19,17 +19,17 @@ #include "olap/lru_cache.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "runtime/tablets_channel.h" namespace doris { LoadChannel::LoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip) + bool is_high_priority, const std::string& sender_ip) : _load_id(load_id), _timeout_s(timeout_s), _is_high_priority(is_high_priority), _sender_ip(sender_ip) { _mem_tracker = MemTracker::create_tracker( - mem_limit, "LoadChannel:" + _load_id.to_string(), mem_tracker, MemTrackerLevel::TASK); + mem_limit, "LoadChannel:" + _load_id.to_string(), nullptr, MemTrackerLevel::TASK); // _last_updated_time should be set before being inserted to // _load_channels in load_channel_mgr, or it may be erased // immediately by gc thread. 
@@ -43,6 +43,7 @@ LoadChannel::~LoadChannel() { } Status LoadChannel::open(const PTabletWriterOpenRequest& params) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t index_id = params.index_id(); std::shared_ptr channel; { @@ -53,7 +54,7 @@ Status LoadChannel::open(const PTabletWriterOpenRequest& params) { } else { // create a new tablets channel TabletsChannelKey key(params.id(), index_id); - channel.reset(new TabletsChannel(key, _mem_tracker, _is_high_priority)); + channel.reset(new TabletsChannel(key, _is_high_priority)); _tablets_channels.insert({index_id, channel}); } } @@ -67,6 +68,7 @@ Status LoadChannel::open(const PTabletWriterOpenRequest& params) { Status LoadChannel::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t index_id = request.index_id(); // 1. get tablets channel std::shared_ptr channel; @@ -111,6 +113,7 @@ Status LoadChannel::add_batch(const PTabletWriterAddBatchRequest& request, } void LoadChannel::handle_mem_exceed_limit(bool force) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // lock so that only one thread can check mem limit std::lock_guard l(_lock); if (!(force || _mem_tracker->limit_exceeded())) { @@ -145,6 +148,7 @@ bool LoadChannel::_find_largest_consumption_channel(std::shared_ptr l(_lock); for (auto& it : _tablets_channels) { it.second->cancel(); diff --git a/be/src/runtime/load_channel.h b/be/src/runtime/load_channel.h index 13490f5fa847f0..ba0ad3033498c8 100644 --- a/be/src/runtime/load_channel.h +++ b/be/src/runtime/load_channel.h @@ -39,8 +39,7 @@ class TabletsChannel; class LoadChannel { public: LoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip); + bool is_high_priority, const std::string& sender_ip); ~LoadChannel(); // open a new load channel if not exist diff 
--git a/be/src/runtime/load_channel_mgr.cpp b/be/src/runtime/load_channel_mgr.cpp index 6604e85fc1279e..a2b109fefe5cf7 100644 --- a/be/src/runtime/load_channel_mgr.cpp +++ b/be/src/runtime/load_channel_mgr.cpp @@ -21,6 +21,7 @@ #include "olap/lru_cache.h" #include "runtime/load_channel.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "service/backend_options.h" #include "util/doris_metrics.h" #include "util/stopwatch.hpp" @@ -28,8 +29,8 @@ namespace doris { DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(load_channel_count, MetricUnit::NOUNIT); -DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(load_mem_consumption, MetricUnit::BYTES, "", - mem_consumption, Labels({{"type", "load"}})); +DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(load_channel_mem_consumption, MetricUnit::BYTES, "", mem_consumption, + Labels({{"type", "load"}})); // Calculate the total memory limit of all load tasks on this BE static int64_t calc_process_max_load_memory(int64_t process_mem_limit) { @@ -70,12 +71,11 @@ LoadChannelMgr::LoadChannelMgr() : _stop_background_threads_latch(1) { std::lock_guard l(_lock); return _load_channels.size(); }); - _last_success_channel = new_lru_cache("LastestSuccessChannelCache", 1024, _mem_tracker); } LoadChannelMgr::~LoadChannelMgr() { DEREGISTER_HOOK_METRIC(load_channel_count); - DEREGISTER_HOOK_METRIC(load_mem_consumption); + DEREGISTER_HOOK_METRIC(load_channel_mem_consumption); _stop_background_threads_latch.count_down(); if (_load_channels_clean_thread) { _load_channels_clean_thread->join(); @@ -85,15 +85,18 @@ LoadChannelMgr::~LoadChannelMgr() { Status LoadChannelMgr::init(int64_t process_mem_limit) { int64_t load_mem_limit = calc_process_max_load_memory(process_mem_limit); - _mem_tracker = MemTracker::create_tracker(load_mem_limit, "LoadChannelMgr", nullptr, MemTrackerLevel::OVERVIEW); - REGISTER_HOOK_METRIC(load_mem_consumption, [this]() { - return _mem_tracker->consumption(); - }); + _mem_tracker = MemTracker::create_tracker(load_mem_limit, "LoadChannelMgr", 
+ ExecEnv::GetInstance()->process_mem_tracker(), + MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + REGISTER_HOOK_METRIC(load_channel_mem_consumption, [this]() { return _mem_tracker->consumption(); }); + _last_success_channel = new_lru_cache("LastestSuccessChannelCache", 1024); RETURN_IF_ERROR(_start_bg_worker()); return Status::OK(); } Status LoadChannelMgr::open(const PTabletWriterOpenRequest& params) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); UniqueId load_id(params.id()); std::shared_ptr channel; { @@ -112,7 +115,7 @@ Status LoadChannelMgr::open(const PTabletWriterOpenRequest& params) { int64_t job_timeout_s = calc_job_timeout_s(timeout_in_req_s); bool is_high_priority = (params.has_is_high_priority() && params.is_high_priority()); - channel.reset(new LoadChannel(load_id, job_max_memory, job_timeout_s, _mem_tracker, is_high_priority, + channel.reset(new LoadChannel(load_id, job_max_memory, job_timeout_s, is_high_priority, params.sender_ip())); _load_channels.insert({load_id, channel}); } @@ -126,6 +129,7 @@ static void dummy_deleter(const CacheKey& key, void* value) {} Status LoadChannelMgr::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); UniqueId load_id(request.id()); // 1. 
get load channel std::shared_ptr channel; @@ -175,6 +179,7 @@ Status LoadChannelMgr::add_batch(const PTabletWriterAddBatchRequest& request, } void LoadChannelMgr::_handle_mem_exceed_limit() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // lock so that only one thread can check mem limit std::lock_guard l(_lock); if (!_mem_tracker->limit_exceeded()) { @@ -208,6 +213,7 @@ void LoadChannelMgr::_handle_mem_exceed_limit() { } Status LoadChannelMgr::cancel(const PTabletWriterCancelRequest& params) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); UniqueId load_id(params.id()); std::shared_ptr cancelled_channel; { @@ -248,6 +254,7 @@ Status LoadChannelMgr::_start_bg_worker() { } Status LoadChannelMgr::_start_load_channels_clean() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::vector> need_delete_channels; LOG(INFO) << "cleaning timed out load channels"; time_t now = time(nullptr); diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp index 3e373d288efc3a..06396851837d92 100644 --- a/be/src/runtime/mem_pool.cpp +++ b/be/src/runtime/mem_pool.cpp @@ -38,28 +38,42 @@ const int MemPool::MAX_CHUNK_SIZE; const int MemPool::DEFAULT_ALIGNMENT; uint32_t MemPool::k_zero_length_region_ alignas(std::max_align_t) = MEM_POOL_POISON; -MemPool::MemPool(MemTracker* mem_tracker) +MemPool::MemPool(std::shared_ptr mem_tracker) : current_chunk_idx_(-1), next_chunk_size_(INITIAL_CHUNK_SIZE), total_allocated_bytes_(0), total_reserved_bytes_(0), peak_allocated_bytes_(0), - new_mem_tracker_(thread_local_ctx.thread_mem_tracker()), - mem_tracker_(mem_tracker) { - DCHECK(mem_tracker != nullptr); + _mem_tracker(mem_tracker) {} + +MemPool::MemPool(std::string label) + : current_chunk_idx_(-1), + next_chunk_size_(INITIAL_CHUNK_SIZE), + total_allocated_bytes_(0), + total_reserved_bytes_(0), + peak_allocated_bytes_(0) { + _mem_tracker = MemTracker::create_tracker(-1, label + ":MemPool"); } +MemPool::MemPool() + : current_chunk_idx_(-1), + 
next_chunk_size_(INITIAL_CHUNK_SIZE), + total_allocated_bytes_(0), + total_reserved_bytes_(0), + peak_allocated_bytes_(0), + _mem_tracker(thread_local_ctx.thread_mem_tracker()) {} + MemPool::ChunkInfo::ChunkInfo(const Chunk& chunk_) : chunk(chunk_), allocated_bytes(0) { DorisMetrics::instance()->memory_pool_bytes_total->increment(chunk.size); } MemPool::~MemPool() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; ChunkAllocator::instance()->free(chunk.chunk); } - mem_tracker_->release(total_bytes_released); DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } @@ -74,6 +88,7 @@ void MemPool::clear() { } void MemPool::free_all() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; @@ -85,16 +100,16 @@ void MemPool::free_all() { total_allocated_bytes_ = 0; total_reserved_bytes_ = 0; - mem_tracker_->release(total_bytes_released); DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } -bool MemPool::find_chunk(size_t min_size, bool check_limits) { +Status MemPool::find_chunk(size_t min_size, bool check_limits) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // Try to allocate from a free chunk. We may have free chunks after the current chunk // if Clear() was called. The current chunk may be free if ReturnPartialAllocation() // was called. The first free chunk (if there is one) can therefore be either the // current chunk or the chunk immediately after the current chunk. 
- int first_free_idx; + int first_free_idx = 0; if (current_chunk_idx_ == -1) { first_free_idx = 0; } else { @@ -109,7 +124,7 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) { if (idx != first_free_idx) std::swap(chunks_[idx], chunks_[first_free_idx]); current_chunk_idx_ = first_free_idx; DCHECK(check_integrity(true)); - return true; + return Status::OK(); } } @@ -127,20 +142,10 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) { } chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size); - if (check_limits) { - Status st = mem_tracker_->try_consume(chunk_size); - WARN_IF_ERROR(st, "try to allocate a new buffer failed"); - if (!st) return false; - } else { - mem_tracker_->consume(chunk_size); - } // Allocate a new chunk. Return early if allocate fails. Chunk chunk; - if (!ChunkAllocator::instance()->allocate(chunk_size, &chunk)) { - mem_tracker_->release(chunk_size); - return false; - } + RETURN_IF_ERROR(ChunkAllocator::instance()->allocate(chunk_size, &chunk, check_limits)); ASAN_POISON_MEMORY_REGION(chunk.data, chunk_size); // Put it before the first free chunk. If no free chunks, it goes at the end. 
if (first_free_idx == static_cast(chunks_.size())) { @@ -155,12 +160,13 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) { next_chunk_size_ = static_cast(std::min(chunk_size * 2, MAX_CHUNK_SIZE)); DCHECK(check_integrity(true)); - return true; + return Status::OK(); } void MemPool::acquire_data(MemPool* src, bool keep_current) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(src->check_integrity(false)); - int num_acquired_chunks; + int num_acquired_chunks = 0; if (keep_current) { num_acquired_chunks = src->current_chunk_idx_; } else if (src->get_free_offset() == 0) { @@ -177,20 +183,16 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) { auto end_chunk = src->chunks_.begin() + num_acquired_chunks; int64_t total_transferred_bytes = 0; + // There is no limit check, assuming that both ends of acquire_data are in the same query. for (auto i = src->chunks_.begin(); i != end_chunk; ++i) { total_transferred_bytes += i->chunk.size; - i->chunk.mem_tracker->transfer_to(new_mem_tracker_, i->chunk.size); - i->chunk.mem_tracker = new_mem_tracker_; + Status st = i->chunk.mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), + i->chunk.size); + i->chunk.mem_tracker = thread_local_ctx.thread_mem_tracker(); } src->total_reserved_bytes_ -= total_transferred_bytes; total_reserved_bytes_ += total_transferred_bytes; - // Skip unnecessary atomic ops if the mem_trackers are the same. 
- if (src->mem_tracker_ != mem_tracker_) { - src->mem_tracker_->release(total_transferred_bytes); - mem_tracker_->consume(total_transferred_bytes); - } - // insert new chunks after current_chunk_idx_ auto insert_chunk = chunks_.begin() + current_chunk_idx_ + 1; chunks_.insert(insert_chunk, src->chunks_.begin(), end_chunk); @@ -216,7 +218,7 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) { } void MemPool::exchange_data(MemPool* other) { - int64_t delta_size = other->total_reserved_bytes_ - total_reserved_bytes_; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::swap(current_chunk_idx_, other->current_chunk_idx_); std::swap(next_chunk_size_, other->next_chunk_size_); @@ -224,20 +226,16 @@ void MemPool::exchange_data(MemPool* other) { std::swap(total_reserved_bytes_, other->total_reserved_bytes_); std::swap(peak_allocated_bytes_, other->peak_allocated_bytes_); std::swap(chunks_, other->chunks_); - + + // There is no limit check, assuming that both ends of acquire_data are in the same query. 
for (auto i = chunks_.begin(); i != chunks_.end(); ++i) { - i->chunk.mem_tracker->transfer_to(new_mem_tracker_, i->chunk.size); - i->chunk.mem_tracker = new_mem_tracker_; + Status st = i->chunk.mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), + i->chunk.size); + i->chunk.mem_tracker = thread_local_ctx.thread_mem_tracker(); } for (auto i = other->chunks_.begin(); i != other->chunks_.end(); ++i) { - i->chunk.mem_tracker->transfer_to(other->new_mem_tracker_, i->chunk.size); - i->chunk.mem_tracker = other->new_mem_tracker_; - } - - // update MemTracker - if (other->mem_tracker_ != mem_tracker_) { - mem_tracker_->consume(delta_size); - other->mem_tracker_->release(delta_size); + Status st = i->chunk.mem_tracker->transfer_to(other->_mem_tracker, i->chunk.size); + i->chunk.mem_tracker = other->_mem_tracker; } } diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h index 3880bf3bbf4d3a..87863ebab71c8e 100644 --- a/be/src/runtime/mem_pool.h +++ b/be/src/runtime/mem_pool.h @@ -27,6 +27,7 @@ #include "common/config.h" #include "common/logging.h" +#include "common/status.h" #include "gutil/dynamic_annotations.h" #include "olap/olap_define.h" #include "runtime/memory/chunk.h" @@ -86,12 +87,11 @@ class MemTracker; /// At this point p.total_allocated_bytes_ would be 0. /// The one remaining (empty) chunk is released: /// delete p; -// -// 存在pool的申请和释放在不同线程被调用。 class MemPool { public: - /// 'tracker' tracks the amount of memory allocated by this pool. Must not be nullptr. - MemPool(MemTracker* mem_tracker); + MemPool(std::shared_ptr mem_tracker); + MemPool(std::string label); + MemPool(); /// Frees all chunks of memory and subtracts the total allocated bytes /// from the registered limits. @@ -100,33 +100,37 @@ class MemPool { /// Allocates a section of memory of 'size' bytes with DEFAULT_ALIGNMENT at the end /// of the the current chunk. Creates a new chunk if there aren't any chunks /// with enough capacity. 
- uint8_t* allocate(int64_t size) { return allocate(size, DEFAULT_ALIGNMENT); } + uint8_t* allocate(int64_t size, Status* rst = nullptr) { + return allocate(size, DEFAULT_ALIGNMENT, rst); + } /// Same as Allocate() expect add a check when return a nullptr - OLAPStatus allocate_safely(int64_t size, uint8_t*& ret) { - return allocate_safely(size, DEFAULT_ALIGNMENT, ret); + OLAPStatus allocate_safely(int64_t size, uint8_t*& ret, Status* rst = nullptr) { + return allocate_safely(size, DEFAULT_ALIGNMENT, ret, rst); } /// Same as Allocate() except the mem limit is checked before the allocation and /// this call will fail (returns nullptr) if it does. /// The caller must handle the nullptr case. This should be used for allocations /// where the size can be very big to bound the amount by which we exceed mem limits. - uint8_t* try_allocate(int64_t size) { return allocate(size, DEFAULT_ALIGNMENT); } + uint8_t* try_allocate(int64_t size, Status* rst = nullptr) { + return allocate(size, DEFAULT_ALIGNMENT, rst); + } /// Same as TryAllocate() except a non-default alignment can be specified. It /// should be a power-of-two in [1, alignof(std::max_align_t)]. - uint8_t* try_allocate_aligned(int64_t size, int alignment) { + uint8_t* try_allocate_aligned(int64_t size, int alignment, Status* rst = nullptr) { DCHECK_GE(alignment, 1); DCHECK_LE(alignment, config::memory_max_alignment); DCHECK_EQ(BitUtil::RoundUpToPowerOfTwo(alignment), alignment); - return allocate(size, alignment); + return allocate(size, alignment, rst); } /// Same as TryAllocate() except returned memory is not aligned at all. - uint8_t* try_allocate_unaligned(int64_t size) { + uint8_t* try_allocate_unaligned(int64_t size, Status* rst = nullptr) { // Call templated implementation directly so that it is inlined here and the // alignment logic can be optimised out. - return allocate(size, 1); + return allocate(size, 1, rst); } /// Makes all allocated chunks available for re-use, but doesn't delete any chunks. 
@@ -153,7 +157,7 @@ class MemPool { int64_t total_reserved_bytes() const { return total_reserved_bytes_; } int64_t peak_allocated_bytes() const { return peak_allocated_bytes_; } - MemTracker* mem_tracker() { return mem_tracker_; } + std::shared_ptr mem_tracker() { return _mem_tracker; } static constexpr int DEFAULT_ALIGNMENT = 8; @@ -183,7 +187,7 @@ class MemPool { /// if a new chunk needs to be created. /// If check_limits is true, this call can fail (returns false) if adding a /// new chunk exceeds the mem limits. - bool find_chunk(size_t min_size, bool check_limits); + Status find_chunk(size_t min_size, bool check_limits); /// Check integrity of the supporting data structures; always returns true but DCHECKs /// all invariants. @@ -198,7 +202,7 @@ class MemPool { } template - uint8_t* ALWAYS_INLINE allocate(int64_t size, int alignment) { + uint8_t* ALWAYS_INLINE allocate(int64_t size, int alignment, Status* rst) { DCHECK_GE(size, 0); if (UNLIKELY(size == 0)) return reinterpret_cast(&k_zero_length_region_); @@ -224,7 +228,12 @@ class MemPool { // guarantee alignment. 
//static_assert( //INITIAL_CHUNK_SIZE >= config::FLAGS_MEMORY_MAX_ALIGNMENT, "Min chunk size too low"); - if (UNLIKELY(!find_chunk(size, CHECK_LIMIT_FIRST))) return nullptr; + if (rst == nullptr) { + if (UNLIKELY(!find_chunk(size, CHECK_LIMIT_FIRST))) return nullptr; + } else { + *rst = find_chunk(size, CHECK_LIMIT_FIRST); + if (UNLIKELY(!*rst)) return nullptr; + } ChunkInfo& info = chunks_[current_chunk_idx_]; uint8_t* result = info.chunk.data + info.allocated_bytes; @@ -238,8 +247,9 @@ class MemPool { } template - OLAPStatus ALWAYS_INLINE allocate_safely(int64_t size, int alignment, uint8_t*& ret) { - uint8_t* result = allocate(size, alignment); + OLAPStatus ALWAYS_INLINE allocate_safely(int64_t size, int alignment, uint8_t*& ret, + Status* rst = nullptr) { + uint8_t* result = allocate(size, alignment, rst); if (result == nullptr) { return OLAP_ERR_MALLOC_ERROR; } @@ -272,13 +282,12 @@ class MemPool { /// The current and peak memory footprint of this pool. This is different from /// total allocated_bytes_ since it includes bytes in chunks that are not used. 
- MemTracker* mem_tracker_; - std::shared_ptr new_mem_tracker_; + std::shared_ptr _mem_tracker; }; // Stamp out templated implementations here so they're included in IR module -template uint8_t* MemPool::allocate(int64_t size, int alignment); -template uint8_t* MemPool::allocate(int64_t size, int alignment); +template uint8_t* MemPool::allocate(int64_t size, int alignment, Status* rst); +template uint8_t* MemPool::allocate(int64_t size, int alignment, Status* rst); } // namespace doris #endif diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index fb5929a7b1f4f5..17397428c1daef 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -43,7 +43,7 @@ static GoogleOnceType root_tracker_once = GOOGLE_ONCE_INIT; void MemTracker::create_root_tracker() { root_tracker.reset(new MemTracker(-1, "Root", nullptr, MemTrackerLevel::OVERVIEW, nullptr)); - root_tracker->Init(); + root_tracker->init(); } std::shared_ptr MemTracker::get_root_tracker() { @@ -79,26 +79,26 @@ std::shared_ptr MemTracker::create_tracker(int64_t byte_limit, const const std::shared_ptr& parent, MemTrackerLevel level, RuntimeProfile* profile) { - std::shared_ptr reset_parent; - std::string reset_label = label; - if (parent) { - // If parent contains query memtracker, add query ID to label. - if (parent->get_task_mem_tracker() != nullptr) { - std::vector parent_label = split(parent->get_task_mem_tracker()->label(), "="); - reset_label = label + ":" + parent_label[parent_label.size() - 1]; - } else { - reset_label = label; - } - reset_parent = std::move(parent); - } else { - reset_parent = get_root_tracker(); + std::shared_ptr reset_parent = parent; + if (!reset_parent) { + reset_parent = thread_local_ctx.thread_mem_tracker(); } - std::shared_ptr tracker( - new MemTracker(byte_limit, reset_label, reset_parent, - level > reset_parent->_level ? 
level : reset_parent->_level, profile)); + std::shared_ptr tracker(new MemTracker( + byte_limit, reset_parent->has_virtual_ancestor() == false ? label : "" + label, + reset_parent, level > reset_parent->_level ? level : reset_parent->_level, profile)); reset_parent->add_child_tracker(tracker); - tracker->Init(); + tracker->init(); + return tracker; +} + +std::shared_ptr MemTracker::create_virtual_tracker( + int64_t byte_limit, const std::string& label, const std::shared_ptr& parent, + MemTrackerLevel level) { + std::shared_ptr tracker(new MemTracker( + byte_limit, "" + label, + parent == nullptr ? thread_local_ctx.thread_mem_tracker() : parent, level, nullptr)); + tracker->init_virtual(); return tracker; } @@ -117,7 +117,7 @@ MemTracker::MemTracker(int64_t byte_limit, const std::string& label, } } -void MemTracker::Init() { +void MemTracker::init() { DCHECK_GE(_limit, -1); MemTracker* tracker = this; while (tracker != nullptr) { @@ -129,10 +129,17 @@ void MemTracker::Init() { DCHECK_EQ(_all_trackers[0], this); } +void MemTracker::init_virtual() { + DCHECK_GE(_limit, -1); + _all_trackers.push_back(this); + if (this->has_limit()) _limit_trackers.push_back(this); + _virtual = true; +} + MemTracker::~MemTracker() { // TCMalloc hook will be triggered during destructor memtracker, may cause crash. if (_label == "Root") GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER(); - if (parent()) { + if (!_virtual && parent()) { if (consumption() != 0) { memory_leak_check(this); // At present, it can only guarantee the accurate recording of the Instance tracker, @@ -152,8 +159,9 @@ MemTracker::~MemTracker() { } } -void MemTracker::transfer_to(std::shared_ptr dst, int64_t bytes) { +void MemTracker::transfer_to_relative(std::shared_ptr dst, int64_t bytes) { DCHECK_EQ(_all_trackers.back(), dst->_all_trackers.back()) << "Must have same ancestor"; + DCHECK(!dst->has_limit()); // Find the common ancestor and update trackers between 'this'/'dst' and // the common ancestor. 
This logic handles all cases, including the // two trackers being the same or being ancestors of each other because @@ -162,6 +170,7 @@ void MemTracker::transfer_to(std::shared_ptr dst, int64_t bytes) { int dst_ancestor_idx = dst->_all_trackers.size() - 1; while (ancestor_idx > 0 && dst_ancestor_idx > 0 && _all_trackers[ancestor_idx - 1] == dst->_all_trackers[dst_ancestor_idx - 1]) { + DCHECK(!dst->_all_trackers[dst_ancestor_idx - 1]->has_limit()); --ancestor_idx; --dst_ancestor_idx; } @@ -170,6 +179,22 @@ void MemTracker::transfer_to(std::shared_ptr dst, int64_t bytes) { dst->consume(bytes, common_ancestor); } +Status MemTracker::transfer_to(std::shared_ptr dst, int64_t bytes) { + // Must release first, then consume + release(bytes); + Status st = dst->try_consume(bytes); + if (!st) { + consume(bytes); + return st; + } + return Status::OK(); +} + +void MemTracker::transfer_to_force(std::shared_ptr dst, int64_t bytes) { + release(bytes); + dst->consume(bytes); +} + // Calling this on the query tracker results in output like: // // Query(4a4c81fedaed337d:4acadfda00000000) Limit=10.00 GB Total=508.28 MB Peak=508.45 MB @@ -240,26 +265,24 @@ std::string MemTracker::log_usage(int max_recursive_depth, return join(usage_strings, "\n"); } -MemTracker* MemTracker::get_task_mem_tracker() { - MemTracker* tracker = this; - while (tracker != nullptr && tracker->_level != MemTrackerLevel::TASK) { - tracker = tracker->_parent.get(); - } - return tracker; -} - Status MemTracker::mem_limit_exceeded(RuntimeState* state, const std::string& details, - int64_t failed_allocation_size) { - DCHECK_GE(failed_allocation_size, 0); + int64_t failed_allocation_size, Status failed_alloc) { MemTracker* process_tracker = ExecEnv::GetInstance()->process_mem_tracker().get(); std::string detail = - "Memory exceed limit. 
details: {}, Label: {}, could not allocate size {} without " - "exceeding limit on backend: {}, Memory left in process limit: {}, by fragment: {}."; - detail = fmt::format( - detail, details, _label, PrettyPrinter::print(failed_allocation_size, TUnit::BYTES), - BackendOptions::get_localhost(), - PrettyPrinter::print(process_tracker->spare_capacity(), TUnit::BYTES), - state != nullptr ? print_id(state->fragment_instance_id()) : std::string()); + "Memory exceed limit. fragment={}, details={}, on backend={}. Memory left in process " + "limit={}."; + detail = fmt::format(detail, state != nullptr ? print_id(state->fragment_instance_id()) : "", + details, BackendOptions::get_localhost(), + PrettyPrinter::print(process_tracker->spare_capacity(), TUnit::BYTES)); + if (!failed_alloc) { + detail += " failed alloc=<{}>. current tracker={}."; + detail = fmt::format(detail, failed_alloc.to_string(), _label); + } else { + detail += " current tracker ."; + detail = fmt::format(detail, _label, _consumption->current_value(), _limit, + PrettyPrinter::print(failed_allocation_size, TUnit::BYTES)); + } + detail += " If query, can change the limit by session variable exec_mem_limit."; Status status = Status::MemoryLimitExceeded(detail); if (state != nullptr) state->log_error(detail); @@ -269,18 +292,14 @@ Status MemTracker::mem_limit_exceeded(RuntimeState* state, const std::string& de // levels limits the level of detail to a one-line summary for each query MemTracker. 
detail += "\n" + process_tracker->log_usage(2); } - if (get_task_mem_tracker() != nullptr) { - detail += "\n" + get_task_mem_tracker()->log_usage(); + if (parent_task_mem_tracker() != nullptr) { + detail += "\n" + parent_task_mem_tracker()->log_usage(); } LOG(WARNING) << detail; return status; } -void MemTracker::add_gc_function(GcFunction f) { - _gc_functions.push_back(f); -} - bool MemTracker::gc_memory(int64_t max_consumption) { if (max_consumption < 0) return true; lock_guard l(_gc_lock); @@ -289,7 +308,7 @@ bool MemTracker::gc_memory(int64_t max_consumption) { if (pre_gc_consumption < max_consumption) return false; int64_t curr_consumption = pre_gc_consumption; - const int64_t EXTRA_BYTES_TO_FREE = 512L * 1024L * 1024L; + const int64_t EXTRA_BYTES_TO_FREE = 4L * 1024L * 1024L * 1024L; // TODO(zxy) Consider as config // Try to free up some memory for (int i = 0; i < _gc_functions.size(); ++i) { // Try to free up the amount we are over plus some extra so that we don't have to diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index c984f0a3accf19..e2ce85c5a65bd1 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -74,6 +74,14 @@ class MemTracker { const std::shared_ptr& parent = std::shared_ptr(), MemTrackerLevel level = MemTrackerLevel::VERBOSE, RuntimeProfile* profile = nullptr); + // Cosume/release will not sync to parent.Usually used to manually record the specified memory, + // It is independent of the recording of TCMalloc Hook in the thread local tracker, so the same + // block of memory is recorded independently in these two trackers. + static std::shared_ptr create_virtual_tracker( + int64_t byte_limit = -1, const std::string& label = std::string(), + const std::shared_ptr& parent = std::shared_ptr(), + MemTrackerLevel level = MemTrackerLevel::VERBOSE); + // this is used for creating an orphan mem tracker, or for unit test. 
// If a mem tracker has parent, it should be created by `create_tracker()` MemTracker(int64_t byte_limit = -1, const std::string& label = std::string()); @@ -86,6 +94,15 @@ class MemTracker { // Gets a shared_ptr to the "root" tracker, creating it if necessary. static std::shared_ptr get_root_tracker(); + inline Status check_sys_mem_info(int64_t bytes) { + if (MemInfo::initialized() && MemInfo::current_mem() + bytes >= MemInfo::mem_limit()) { + return Status::MemoryLimitExceeded(fmt::format( + "{}: TryConsume failed, bytes={} process whole consumption={} mem limit={}", + _label, bytes, MemInfo::current_mem(), MemInfo::mem_limit())); + } + return Status::OK(); + } + // Increases consumption of this tracker and its ancestors by 'bytes'. // up to (but not including) end_tracker. // This is useful if we want to move tracking between trackers that share a common (i.e. end_tracker) @@ -112,40 +129,27 @@ class MemTracker { release(-bytes); return Status::OK(); } - // TCMalloc new/delete hook will call consume before MemInfo is initialized. - if (MemInfo::initialized() && MemInfo::current_mem() + bytes >= MemInfo::mem_limit()) { - return Status::MemoryLimitExceeded(fmt::format( - "{}: TryConsume failed, bytes={} process whole consumption={} mem limit={}", - _label, bytes, MemInfo::current_mem(), MemInfo::mem_limit())); - } + RETURN_IF_ERROR(check_sys_mem_info(bytes)); int i; // Walk the tracker tree top-down. for (i = _all_trackers.size() - 1; i >= 0; --i) { MemTracker* tracker = _all_trackers[i]; - const int64_t limit = tracker->limit(); - if (limit < 0) { + if (tracker->limit() < 0) { tracker->_consumption->add(bytes); // No limit at this tracker. } else { // If TryConsume fails, we can try to GC, but we may need to try several times if // there are concurrent consumers because we don't take a lock before trying to // update _consumption. 
while (true) { - if (LIKELY(tracker->_consumption->try_add(bytes, limit))) break; - - if (UNLIKELY(tracker->gc_memory(limit - bytes))) { + if (LIKELY(tracker->_consumption->try_add(bytes, tracker->limit()))) break; + Status st = tracker->try_gc_memory(bytes); + if (!st) { // Failed for this mem tracker. Roll back the ones that succeeded. for (int j = _all_trackers.size() - 1; j > i; --j) { _all_trackers[j]->_consumption->add(-bytes); } - return Status::MemoryLimitExceeded(fmt::format( - "{}: TryConsume failed, bytes={} consumption={} imit={} " - "attempting to GC", - tracker->label(), bytes, tracker->_consumption->current_value(), - limit)); + return st; } - VLOG_NOTICE << "GC succeeded, TryConsume bytes=" << bytes - << " consumption=" << tracker->_consumption->current_value() - << " limit=" << limit; } } } @@ -182,11 +186,15 @@ class MemTracker { return Status::OK(); } - /// Transfer 'bytes' of consumption from this tracker to 'dst', updating - /// all ancestors up to the first shared ancestor. Must not be used if - /// 'dst' has a limit, or an ancestor with a limit, that is not a common - /// ancestor with the tracker, because this does not check memory limits. - void transfer_to(std::shared_ptr dst, int64_t bytes); + // Transfer 'bytes' of consumption from this tracker to 'dst'. + // updating all ancestors up to the first shared ancestor. Must not be used if + // 'dst' has a limit, or an ancestor with a limit, that is not a common + // ancestor with the tracker, because this does not check memory limits. + void transfer_to_relative(std::shared_ptr dst, int64_t bytes); + WARN_UNUSED_RESULT + Status transfer_to(std::shared_ptr dst, int64_t bytes); + // Forced transfer, 'dst' may limit exceed, and more ancestor trackers will be updated. + void transfer_to_force(std::shared_ptr dst, int64_t bytes); // Returns true if a valid limit of this tracker or one of its ancestors is exceeded. 
MemTracker* limit_exceeded_tracker() const { @@ -227,6 +235,24 @@ class MemTracker { int64_t limit() const { return _limit; } bool has_limit() const { return _limit >= 0; } + Status check_limit(int64_t bytes) { + if (bytes <= 0) return Status::OK(); + RETURN_IF_ERROR(check_sys_mem_info(bytes)); + int i; + // Walk the tracker tree top-down. + for (i = _all_trackers.size() - 1; i >= 0; --i) { + MemTracker* tracker = _all_trackers[i]; + if (tracker->limit() > 0) { + while (true) { + if (LIKELY(tracker->_consumption->current_value() + bytes < tracker->limit())) + break; + RETURN_IF_ERROR(tracker->try_gc_memory(bytes)); + } + } + } + return Status::OK(); + } + const std::string& label() const { return _label; } // Returns the memory consumed in bytes. @@ -240,7 +266,7 @@ class MemTracker { /// previously-added GC functions were successful at freeing up enough memory. /// 'f' does not need to be thread-safe as long as it is added to only one MemTracker. /// Note that 'f' must be valid for the lifetime of this MemTracker. - void add_gc_function(GcFunction f); + void add_gc_function(GcFunction f) { _gc_functions.push_back(f); } /// Logs the usage of this tracker and optionally its children (recursively). /// If 'logged_consumption' is non-nullptr, sets the consumption value logged. @@ -256,10 +282,25 @@ class MemTracker { /// 'failed_allocation_size' is zero, nothing about the allocation size is logged. /// If 'state' is non-nullptr, logs the error to 'state'. Status mem_limit_exceeded(RuntimeState* state, const std::string& details = std::string(), - int64_t failed_allocation = 0) WARN_UNUSED_RESULT; + int64_t failed_allocation = -1, + Status failed_alloc = Status::OK()) WARN_UNUSED_RESULT; // If an ancestor of this tracker is a Task MemTracker, return that tracker. Otherwise return nullptr. 
- MemTracker* get_task_mem_tracker(); + MemTracker* parent_task_mem_tracker() { + MemTracker* tracker = this; + while (tracker != nullptr && tracker->_level != MemTrackerLevel::TASK) { + tracker = tracker->_parent.get(); + } + return tracker; + } + + bool has_virtual_ancestor() { + MemTracker* tracker = this; + while (tracker != nullptr && tracker->_virtual == false) { + tracker = tracker->_parent.get(); + } + return tracker == nullptr ? false : true; + } std::string debug_string() { std::stringstream msg; @@ -285,9 +326,21 @@ class MemTracker { // any added GC functions. Returns true if max_consumption is still exceeded. Takes gc_lock. bool gc_memory(int64_t max_consumption); - /// Walks the MemTracker hierarchy and populates _all_trackers and - /// limit_trackers_ - void Init(); + inline Status try_gc_memory(int64_t bytes) { + if (UNLIKELY(gc_memory(_limit - bytes))) { + return Status::MemoryLimitExceeded( + fmt::format("label={} TryConsume failed size={}, used={}, limit={}", + label(), bytes, _consumption->current_value(), _limit)); + } + VLOG_NOTICE << "GC succeeded, TryConsume bytes=" << bytes + << " consumption=" << _consumption->current_value() << " limit=" << _limit; + return Status::OK(); + } + + // Walks the MemTracker hierarchy and populates _all_trackers and + // limit_trackers_ + void init(); + void init_virtual(); // Adds tracker to _child_trackers void add_child_tracker(const std::shared_ptr& tracker) { @@ -332,9 +385,11 @@ class MemTracker { std::string _label; + std::shared_ptr _parent; // The parent of this tracker. + MemTrackerLevel _level; - std::shared_ptr _parent; // The parent of this tracker. + bool _virtual = false; std::shared_ptr _consumption; // in bytes @@ -355,24 +410,10 @@ class MemTracker { std::vector _gc_functions; }; -#define LIMIT_EXCEEDED(tracker, state, msg) \ - do { \ - stringstream str; \ - str << "Memory exceed limit. 
" << msg << " "; \ - str << "Backend: " << BackendOptions::get_localhost() << ", "; \ - str << "fragment: " << print_id(state->fragment_instance_id()) << " "; \ - str << "Used: " << tracker->consumption() << ", Limit: " << tracker->limit() << ". "; \ - str << "You can change the limit by session variable exec_mem_limit."; \ - return Status::MemoryLimitExceeded(str.str()); \ - } while (false) - -#define RETURN_IF_LIMIT_EXCEEDED(state, msg) \ - do { \ - /* if (UNLIKELY(MemTracker::limit_exceeded(*(state)->mem_trackers()))) { */ \ - MemTracker* tracker = state->instance_mem_tracker()->limit_exceeded_tracker(); \ - if (tracker != nullptr) { \ - LIMIT_EXCEEDED(tracker, state, msg); \ - } \ - } while (false) +#define RETURN_LIMIT_EXCEEDED(tracker, state, msg) return tracker->mem_limit_exceeded(state, msg); +#define RETURN_ALLOC_LIMIT_EXCEEDED(tracker, state, msg, size, st) \ + return tracker->mem_limit_exceeded(state, msg, size, st); +#define RETURN_IF_LIMIT_EXCEEDED(tracker, state, msg) \ + if (tracker->any_limit_exceeded()) RETURN_LIMIT_EXCEEDED(tracker, state, msg); } // namespace doris diff --git a/be/src/runtime/mem_tracker_task_pool.cpp b/be/src/runtime/mem_tracker_task_pool.cpp index a233b714ed01fe..1208161d297b73 100644 --- a/be/src/runtime/mem_tracker_task_pool.cpp +++ b/be/src/runtime/mem_tracker_task_pool.cpp @@ -23,50 +23,62 @@ namespace doris { +std::shared_ptr MemTrackerTaskPool::register_task_mem_tracker_impl( + const std::string& task_id, int64_t mem_limit, const std::string& label, + std::shared_ptr parent) { + DCHECK(!task_id.empty()); + // First time this task_id registered, make a new object, otherwise do nothing. + // Combine create_tracker and emplace into one operation to avoid the use of locks + // Name for task MemTrackers. '$0' is replaced with the task id. 
+ _task_mem_trackers.try_emplace_l( + task_id, [](std::shared_ptr) {}, + MemTracker::create_tracker(mem_limit, label, parent, MemTrackerLevel::TASK)); + std::shared_ptr tracker = get_task_mem_tracker(task_id); + return tracker; +} + std::shared_ptr MemTrackerTaskPool::register_query_mem_tracker( const std::string& query_id, int64_t mem_limit) { - DCHECK(!query_id.empty()); - VLOG_FILE << "Register query memory tracker, query id: " << query_id + VLOG_FILE << "Register Query memory tracker, query id: " << query_id << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); + return register_task_mem_tracker_impl(query_id, mem_limit, fmt::format("queryId={}", query_id), + ExecEnv::GetInstance()->query_pool_mem_tracker()); +} - // First time this query_id registered, make a new object, otherwise do nothing. - // Combine create_tracker and emplace into one operation to avoid the use of locks - // Name for query MemTrackers. '$0' is replaced with the query id. - _query_mem_trackers.try_emplace_l( - query_id, [](std::shared_ptr) {}, - MemTracker::create_tracker(mem_limit, fmt::format("queryId={}", query_id), - ExecEnv::GetInstance()->query_pool_mem_tracker(), - MemTrackerLevel::TASK)); - std::shared_ptr tracker = get_query_mem_tracker(query_id); - return tracker; +std::shared_ptr MemTrackerTaskPool::register_load_mem_tracker( + const std::string& load_id, int64_t mem_limit) { + VLOG_FILE << "Register Load memory tracker, load id: " << load_id + << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); + return register_task_mem_tracker_impl(load_id, mem_limit, fmt::format("loadId={}", load_id), + ExecEnv::GetInstance()->load_pool_mem_tracker()); } -std::shared_ptr MemTrackerTaskPool::get_query_mem_tracker(const std::string& query_id) { - DCHECK(!query_id.empty()); +std::shared_ptr MemTrackerTaskPool::get_task_mem_tracker(const std::string& task_id) { + DCHECK(!task_id.empty()); std::shared_ptr tracker = nullptr; // Avoid using locks to resolve erase conflicts - 
_query_mem_trackers.if_contains(query_id, - [&tracker](std::shared_ptr v) { tracker = v; }); + _task_mem_trackers.if_contains(task_id, + [&tracker](std::shared_ptr v) { tracker = v; }); return tracker; } -void MemTrackerTaskPool::logout_query_mem_tracker() { - std::vector expired_querys; - for (auto it = _query_mem_trackers.begin(); it != _query_mem_trackers.end(); it++) { - // No RuntimeState uses this query MemTracker, it is only referenced by this map, delete it +void MemTrackerTaskPool::logout_task_mem_tracker() { + std::vector expired_tasks; + for (auto it = _task_mem_trackers.begin(); it != _task_mem_trackers.end(); it++) { + // No RuntimeState uses this task MemTracker, it is only referenced by this map, delete it if (it->second.use_count() == 1) { if (!config::memory_leak_detection || it->second->consumption() == 0) { - expired_querys.emplace_back(it->first); + expired_tasks.emplace_back(it->first); } else { LOG(WARNING) << "Memory tracker " << it->second->debug_string() << " Memory leak " << it->second->consumption(); } } } - for (auto qid : expired_querys) { - DCHECK(_query_mem_trackers[qid].use_count() == 1); - _query_mem_trackers.erase(qid); - VLOG_FILE << "Deregister query memory tracker, query id: " << qid; + for (auto tid : expired_tasks) { + DCHECK(_task_mem_trackers[tid].use_count() == 1); + _task_mem_trackers.erase(tid); + VLOG_FILE << "Deregister task memory tracker, task id: " << tid; } } diff --git a/be/src/runtime/mem_tracker_task_pool.h b/be/src/runtime/mem_tracker_task_pool.h index f927dffd369663..0cf566446dadb0 100644 --- a/be/src/runtime/mem_tracker_task_pool.h +++ b/be/src/runtime/mem_tracker_task_pool.h @@ -26,22 +26,28 @@ namespace doris { // Global task pool for query MemTrackers. Owned by ExecEnv. class MemTrackerTaskPool { public: - // Construct a MemTracker object for 'query_id' with 'mem_limit' as the memory limit. 
- // The MemTracker is a child of the process MemTracker, Calling this with the same - // 'query_id' will return the same MemTracker object. This is used to track the local - // memory usage of all querys executing. The first time this is called for a query, - // a new MemTracker object is created with the process tracker as its parent. + // Construct a MemTracker object for 'task_id' with 'mem_limit' as the memory limit. + // The MemTracker is a child of the pool MemTracker, Calling this with the same + // 'task_id' will return the same MemTracker object. This is used to track the local + // memory usage of all tasks executing. The first time this is called for a task, + // a new MemTracker object is created with the pool tracker as its parent. // Newly created trackers will always have a limit of -1. + std::shared_ptr register_task_mem_tracker_impl(const std::string& task_id, + int64_t mem_limit, + const std::string& label, + std::shared_ptr parent); std::shared_ptr register_query_mem_tracker(const std::string& query_id, - int64_t mem_limit = -1); + int64_t mem_limit); + std::shared_ptr register_load_mem_tracker(const std::string& load_id, + int64_t mem_limit); - std::shared_ptr get_query_mem_tracker(const std::string& query_id); + std::shared_ptr get_task_mem_tracker(const std::string& task_id); - void logout_query_mem_tracker(); + void logout_task_mem_tracker(); private: - // All per-query MemTracker objects. - // The life cycle of query memtracker in the process is the same as query runtime state, + // All per-task MemTracker objects. + // The life cycle of task memtracker in the process is the same as task runtime state, // MemTrackers will be removed from this map after query finish or cancel. 
using TaskTrackersMap = phmap::parallel_flat_hash_map< std::string, std::shared_ptr, phmap::priv::hash_default_hash, @@ -49,7 +55,7 @@ class MemTrackerTaskPool { std::allocator>>, 12, std::mutex>; - TaskTrackersMap _query_mem_trackers; + TaskTrackersMap _task_mem_trackers; }; } // namespace doris \ No newline at end of file diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp index 25d6ed82a8fe24..77717ee7a79955 100644 --- a/be/src/runtime/memory/chunk_allocator.cpp +++ b/be/src/runtime/memory/chunk_allocator.cpp @@ -116,10 +116,9 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) : _reserve_bytes_limit(reserve_limit), _reserved_bytes(0), _arenas(CpuInfo::get_max_num_cores()) { - _chunk_allocator_mem_tracker = - MemTracker::create_tracker(static_cast(reserve_limit), "ChunkAllocator", - nullptr, MemTrackerLevel::OVERVIEW); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker, "ChunkAllocator", false); + _mem_tracker = + MemTracker::create_tracker(-1, "ChunkAllocator", nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); for (int i = 0; i < _arenas.size(); ++i) { _arenas[i].reset(new ChunkArena()); } @@ -134,11 +133,16 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_free_cost_ns); } -bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { +Status ChunkAllocator::allocate(size_t size, Chunk* chunk, bool check_limits) { // fast path: allocate from current core arena chunk->mem_tracker = thread_local_ctx.thread_mem_tracker(); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker, "ChunkAllocator", false); - thread_local_ctx.thread_mem_tracker()->transfer_to(chunk->mem_tracker, size); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + if (check_limits) { + RETURN_IF_ERROR(thread_local_ctx.thread_mem_tracker()->transfer_to(chunk->mem_tracker, 
size)); + } else { + thread_local_ctx.thread_mem_tracker()->transfer_to_force(chunk->mem_tracker, size); + } + int core_id = CpuInfo::get_current_core(); chunk->size = size; chunk->core_id = core_id; @@ -147,7 +151,7 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { DCHECK_GE(_reserved_bytes, 0); _reserved_bytes.fetch_sub(size); chunk_pool_local_core_alloc_count->increment(1); - return true; + return Status::OK(); } if (_reserved_bytes > size) { // try to allocate from other core's arena @@ -159,7 +163,7 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { chunk_pool_other_core_alloc_count->increment(1); // reset chunk's core_id to other chunk->core_id = core_id % _arenas.size(); - return true; + return Status::OK(); } } } @@ -173,10 +177,11 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { chunk_pool_system_alloc_count->increment(1); chunk_pool_system_alloc_cost_ns->increment(cost_ns); if (chunk->data == nullptr) { - chunk->mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), size); - return false; + Status st = chunk->mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), size); + return Status::MemoryAllocFailed( + fmt::format("ChunkAllocator failed to allocate chunk {} bytes", size)); } - return true; + return Status::OK(); } void ChunkAllocator::free(Chunk& chunk) { @@ -184,8 +189,8 @@ void ChunkAllocator::free(Chunk& chunk) { return; } DCHECK(chunk.mem_tracker != nullptr); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_chunk_allocator_mem_tracker, "ChunkAllocator", false); - chunk.mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), chunk.size); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + Status st = chunk.mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), chunk.size); int64_t old_reserved_bytes = _reserved_bytes; int64_t new_reserved_bytes = 0; do { @@ -207,8 +212,8 @@ void ChunkAllocator::free(Chunk& chunk) { chunk.mem_tracker = nullptr; } -bool 
ChunkAllocator::allocate_align(size_t size, Chunk* chunk) { - return allocate(BitUtil::RoundUpToPowerOfTwo(size), chunk); +Status ChunkAllocator::allocate_align(size_t size, Chunk* chunk, bool check_limits) { + return allocate(BitUtil::RoundUpToPowerOfTwo(size), chunk, check_limits); } } // namespace doris diff --git a/be/src/runtime/memory/chunk_allocator.h b/be/src/runtime/memory/chunk_allocator.h index c94089596a4879..4c141358e4f142 100644 --- a/be/src/runtime/memory/chunk_allocator.h +++ b/be/src/runtime/memory/chunk_allocator.h @@ -29,6 +29,7 @@ class Chunk; class ChunkArena; class MetricEntity; class MemTracker; +class Status; // Used to allocate memory with power-of-two length. // This Allocator allocate memory from system and cache free chunks for @@ -64,9 +65,9 @@ class ChunkAllocator { // Allocate a Chunk with a power-of-two length "size". // Return true if success and allocated chunk is saved in "chunk". // Otherwise return false. - bool allocate(size_t size, Chunk* chunk); + Status allocate(size_t size, Chunk* chunk, bool check_limits = false); - bool allocate_align(size_t size, Chunk* chunk); + Status allocate_align(size_t size, Chunk* chunk, bool check_limits = false); // Free chunk allocated from this allocator void free(Chunk& chunk); @@ -81,7 +82,7 @@ class ChunkAllocator { std::shared_ptr _chunk_allocator_metric_entity; - std::shared_ptr _chunk_allocator_mem_tracker; + std::shared_ptr _mem_tracker; }; } // namespace doris diff --git a/be/src/runtime/memory_scratch_sink.h b/be/src/runtime/memory_scratch_sink.h index 658aa0eb6478ad..7f63f2cfb03bed 100644 --- a/be/src/runtime/memory_scratch_sink.h +++ b/be/src/runtime/memory_scratch_sink.h @@ -42,7 +42,6 @@ class RuntimeProfile; class BufferControlBlock; class ExprContext; class ResultWriter; -class MemTracker; class TupleRow; // used to push data to blocking queue diff --git a/be/src/runtime/odbc_table_sink.cpp b/be/src/runtime/odbc_table_sink.cpp index e9c166d8b7e0b7..991ca64f0b6fc9 100644 --- 
a/be/src/runtime/odbc_table_sink.cpp +++ b/be/src/runtime/odbc_table_sink.cpp @@ -21,18 +21,14 @@ #include "exprs/expr.h" #include "runtime/runtime_state.h" -#include "runtime/mem_tracker.h" -#include "util/runtime_profile.h" #include "util/debug_util.h" +#include "util/runtime_profile.h" namespace doris { OdbcTableSink::OdbcTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs) - : _pool(pool), - _row_desc(row_desc), - _t_output_expr(t_exprs), - _mem_tracker(MemTracker::create_tracker(-1, "OdbcTableSink")) { + const std::vector& t_exprs) + : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs) { _name = "OOBC_TABLE_SINK"; } @@ -56,7 +52,7 @@ Status OdbcTableSink::init(const TDataSink& t_sink) { Status OdbcTableSink::prepare(RuntimeState* state) { RETURN_IF_ERROR(DataSink::prepare(state)); // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker)); + RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _expr_mem_tracker)); std::stringstream title; title << "ODBC_TABLE_SINK (frag_id=" << state->fragment_instance_id() << ")"; // create profile @@ -100,4 +96,4 @@ Status OdbcTableSink::close(RuntimeState* state, Status exec_status) { return Status::OK(); } -} +} // namespace doris diff --git a/be/src/runtime/odbc_table_sink.h b/be/src/runtime/odbc_table_sink.h index 385075b49aa658..3f9c8fd3b25ce6 100644 --- a/be/src/runtime/odbc_table_sink.h +++ b/be/src/runtime/odbc_table_sink.h @@ -32,7 +32,6 @@ class TOdbcTableSink; class RuntimeState; class RuntimeProfile; class ExprContext; -class MemTracker; //This class is a sinker, which put input data to odbc table class OdbcTableSink : public DataSink { @@ -73,9 +72,8 @@ class OdbcTableSink : public DataSink { bool _use_transaction; RuntimeProfile* _profile; - std::shared_ptr _mem_tracker; }; -} +} // namespace doris #endif diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp 
index 7b57b3420add43..4837770ba738a9 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -91,6 +91,8 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _runtime_state->set_query_fragments_ctx(fragments_ctx); RETURN_IF_ERROR(_runtime_state->init_mem_trackers(_query_id)); + SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(), print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id(), _runtime_state->instance_mem_tracker()); _runtime_state->set_be_number(request.backend_num); if (request.__isset.backend_id) { _runtime_state->set_backend_id(request.backend_id); @@ -127,13 +129,6 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, << ". Using process memory limit instead"; bytes_limit = _exec_env->process_mem_tracker()->limit(); } - // NOTE: this MemTracker only for olap - _mem_tracker = - MemTracker::create_tracker(bytes_limit, - "PlanFragmentExecutor:" + print_id(_query_id) + ":" + - print_id(params.fragment_instance_id), - _exec_env->process_mem_tracker(), MemTrackerLevel::INSTANCE); - _runtime_state->set_fragment_mem_tracker(_mem_tracker); RETURN_IF_ERROR(_runtime_state->create_block_mgr()); @@ -222,8 +217,7 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _rows_produced_counter = ADD_COUNTER(profile(), "RowsProduced", TUnit::UNIT); _fragment_cpu_timer = ADD_TIMER(profile(), "FragmentCpuTime"); - _row_batch.reset(new RowBatch(_plan->row_desc(), _runtime_state->batch_size(), - _runtime_state->instance_mem_tracker().get())); + _row_batch.reset(new RowBatch(_plan->row_desc(), _runtime_state->batch_size())); _block.reset(new doris::vectorized::Block()); // _row_batch->tuple_data_pool()->set_limits(*_runtime_state->mem_trackers()); VLOG_NOTICE << "plan_root=\n" << _plan->debug_string(); @@ -237,7 +231,7 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, } Status 
PlanFragmentExecutor::open() { - int64_t mem_limit = _runtime_state->fragment_mem_tracker()->limit(); + int64_t mem_limit = _runtime_state->instance_mem_tracker()->limit(); TAG(LOG(INFO)) .log("PlanFragmentExecutor::open, using query memory limit: " + PrettyPrinter::print(mem_limit, TUnit::BYTES)) @@ -460,12 +454,13 @@ void PlanFragmentExecutor::_collect_node_statistics() { DCHECK(_runtime_state->backend_id() != -1); NodeStatistics* node_statistics = _query_statistics->add_nodes_statistics(_runtime_state->backend_id()); - node_statistics->add_peak_memory(_mem_tracker->peak_consumption()); + node_statistics->add_peak_memory(_runtime_state->instance_mem_tracker()->peak_consumption()); } void PlanFragmentExecutor::report_profile() { - SCOPED_ATTACH_TASK_THREAD(ThreadContext::QUERY, print_id(_runtime_state->query_id()), - _runtime_state->fragment_instance_id()); + SCOPED_ATTACH_TASK_THREAD_4ARG( + _runtime_state->query_type(), print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id(), _runtime_state->instance_mem_tracker()); VLOG_FILE << "report_profile(): instance_id=" << _runtime_state->fragment_instance_id(); DCHECK(_report_status_cb); @@ -616,7 +611,7 @@ void PlanFragmentExecutor::update_status(const Status& new_status) { _runtime_state->set_mem_limit_exceeded(new_status.get_error_msg()); } _status = new_status; - if (_runtime_state->query_options().query_type == TQueryType::EXTERNAL) { + if (_runtime_state->query_type() == TQueryType::EXTERNAL) { TUniqueId fragment_instance_id = _runtime_state->fragment_instance_id(); _exec_env->result_queue_mgr()->update_queue_status(fragment_instance_id, new_status); @@ -702,10 +697,6 @@ void PlanFragmentExecutor::close() { << print_id(_runtime_state->fragment_instance_id()); } - // _mem_tracker init failed - if (_mem_tracker.get() != nullptr) { - _mem_tracker->release(_mem_tracker->consumption()); - } _closed = true; } diff --git a/be/src/runtime/plan_fragment_executor.h 
b/be/src/runtime/plan_fragment_executor.h index e40bbcb023ba87..12201d6d71da45 100644 --- a/be/src/runtime/plan_fragment_executor.h +++ b/be/src/runtime/plan_fragment_executor.h @@ -148,7 +148,6 @@ class PlanFragmentExecutor { ExecEnv* _exec_env; // not owned ExecNode* _plan; // lives in _runtime_state->obj_pool() TUniqueId _query_id; - std::shared_ptr _mem_tracker; // profile reporting-related report_status_callback _report_status_cb; diff --git a/be/src/runtime/qsorter.cpp b/be/src/runtime/qsorter.cpp index 951b35a63afaac..cec1b6cd2e3ad6 100644 --- a/be/src/runtime/qsorter.cpp +++ b/be/src/runtime/qsorter.cpp @@ -81,7 +81,7 @@ QSorter::QSorter(const RowDescriptor& row_desc, const std::vector& RuntimeState* state) : _row_desc(row_desc), _order_expr_ctxs(order_expr_ctxs), - _tuple_pool(new MemPool(state->instance_mem_tracker().get())) {} + _tuple_pool(new MemPool("QSorter")) {} Status QSorter::prepare(RuntimeState* state) { RETURN_IF_ERROR(Expr::clone_if_not_exists(_order_expr_ctxs, state, &_lhs_expr_ctxs)); diff --git a/be/src/runtime/result_file_sink.cpp b/be/src/runtime/result_file_sink.cpp index c35722c3c70e1c..878f8fe67890a7 100644 --- a/be/src/runtime/result_file_sink.cpp +++ b/be/src/runtime/result_file_sink.cpp @@ -22,7 +22,6 @@ #include "runtime/buffer_control_block.h" #include "runtime/exec_env.h" #include "runtime/file_result_writer.h" -#include "runtime/mem_tracker.h" #include "runtime/mysql_result_writer.h" #include "runtime/result_buffer_mgr.h" #include "runtime/row_batch.h" @@ -110,11 +109,8 @@ Status ResultFileSink::prepare(RuntimeState* state) { _local_bytes_send_counter = ADD_COUNTER(profile(), "LocalBytesSent", TUnit::BYTES); _uncompressed_bytes_counter = ADD_COUNTER(profile(), "UncompressedRowBatchSize", TUnit::BYTES); - _mem_tracker = MemTracker::create_tracker( - -1, "ResultFileSink:" + print_id(state->fragment_instance_id()), - state->instance_mem_tracker(), MemTrackerLevel::VERBOSE, _profile); // create writer - _output_batch = new 
RowBatch(_output_row_descriptor, 1024, _mem_tracker.get()); + _output_batch = new RowBatch(_output_row_descriptor, 1024); _writer.reset(new (std::nothrow) FileResultWriter( _file_opts.get(), _storage_type, state->fragment_instance_id(), _output_expr_ctxs, _profile, nullptr, _output_batch, state->return_object_data_as_binary())); diff --git a/be/src/runtime/result_file_sink.h b/be/src/runtime/result_file_sink.h index cef47cc10a8577..60a1ae2f03c238 100644 --- a/be/src/runtime/result_file_sink.h +++ b/be/src/runtime/result_file_sink.h @@ -34,7 +34,6 @@ class RuntimeProfile; class BufferControlBlock; class ExprContext; class ResultWriter; -class MemTracker; class ResultFileOptions; class ResultFileSink : public DataStreamSender { diff --git a/be/src/runtime/result_sink.h b/be/src/runtime/result_sink.h index 08fd6338c38f87..5368c8bcd8d3fa 100644 --- a/be/src/runtime/result_sink.h +++ b/be/src/runtime/result_sink.h @@ -33,7 +33,6 @@ class RuntimeProfile; class BufferControlBlock; class ExprContext; class ResultWriter; -class MemTracker; class ResultFileOptions; namespace vectorized { diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp index 53b8f8f3948d75..eed731b84bedca 100644 --- a/be/src/runtime/row_batch.cpp +++ b/be/src/runtime/row_batch.cpp @@ -27,6 +27,7 @@ #include "runtime/collection_value.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" @@ -40,8 +41,8 @@ namespace doris { const int RowBatch::AT_CAPACITY_MEM_USAGE = 8 * 1024 * 1024; const int RowBatch::FIXED_LEN_BUFFER_LIMIT = AT_CAPACITY_MEM_USAGE / 2; -RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_tracker) - : _mem_tracker(mem_tracker), +RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity) + : _mem_tracker(thread_local_ctx.thread_mem_tracker()), _has_in_flight_row(false), _num_rows(0), _num_uncommitted_rows(0), @@ -52,13 
+53,13 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_ _row_desc(row_desc), _auxiliary_mem_usage(0), _need_to_return(false), - _tuple_data_pool(_mem_tracker) { - DCHECK(_mem_tracker != nullptr); + _tuple_data_pool() { DCHECK_GT(capacity, 0); _tuple_ptrs_size = _capacity * _num_tuples_per_row * sizeof(Tuple*); DCHECK_GT(_tuple_ptrs_size, 0); // TODO: switch to Init() pattern so we can check memory limit and return Status. if (config::enable_partitioned_aggregation) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); _mem_tracker->consume(_tuple_ptrs_size); _tuple_ptrs = (Tuple**)(malloc(_tuple_ptrs_size)); DCHECK(_tuple_ptrs != nullptr); @@ -73,8 +74,8 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_ // xfer += iprot->readString(this->tuple_data[_i9]); // to allocated string data in special mempool // (change via python script that runs over Data_types.cc) -RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, MemTracker* tracker) - : _mem_tracker(tracker), +RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch) + : _mem_tracker(thread_local_ctx.thread_mem_tracker()), _has_in_flight_row(false), _num_rows(input_batch.num_rows()), _num_uncommitted_rows(0), @@ -85,12 +86,12 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, _row_desc(row_desc), _auxiliary_mem_usage(0), _need_to_return(false), - _tuple_data_pool(_mem_tracker) { - DCHECK(_mem_tracker != nullptr); + _tuple_data_pool() { _tuple_ptrs_size = _num_rows * _num_tuples_per_row * sizeof(Tuple*); DCHECK_GT(_tuple_ptrs_size, 0); // TODO: switch to Init() pattern so we can check memory limit and return Status. 
if (config::enable_partitioned_aggregation) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); _mem_tracker->consume(_tuple_ptrs_size); _tuple_ptrs = (Tuple**)(malloc(_tuple_ptrs_size)); DCHECK(_tuple_ptrs != nullptr); @@ -221,8 +222,8 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, // xfer += iprot->readString(this->tuple_data[_i9]); // to allocated string data in special mempool // (change via python script that runs over Data_types.cc) -RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch, MemTracker* tracker) - : _mem_tracker(tracker), +RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch) + : _mem_tracker(thread_local_ctx.thread_mem_tracker()), _has_in_flight_row(false), _num_rows(input_batch.num_rows), _num_uncommitted_rows(0), @@ -233,12 +234,12 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch, _row_desc(row_desc), _auxiliary_mem_usage(0), _need_to_return(false), - _tuple_data_pool(_mem_tracker) { - DCHECK(_mem_tracker != nullptr); + _tuple_data_pool() { _tuple_ptrs_size = _num_rows * input_batch.row_tuples.size() * sizeof(Tuple*); DCHECK_GT(_tuple_ptrs_size, 0); // TODO: switch to Init() pattern so we can check memory limit and return Status. 
if (config::enable_partitioned_aggregation) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); _mem_tracker->consume(_tuple_ptrs_size); _tuple_ptrs = (Tuple**)malloc(_tuple_ptrs_size); DCHECK(_tuple_ptrs != nullptr); @@ -367,6 +368,7 @@ void RowBatch::clear() { _blocks[i]->del(); } if (config::enable_partitioned_aggregation) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); DCHECK(_tuple_ptrs != nullptr); free(_tuple_ptrs); _mem_tracker->release(_tuple_ptrs_size); @@ -485,7 +487,7 @@ void RowBatch::add_io_buffer(DiskIoMgr::BufferDescriptor* buffer) { DCHECK(buffer != nullptr); _io_buffers.push_back(buffer); _auxiliary_mem_usage += buffer->buffer_len(); - buffer->set_mem_tracker(std::shared_ptr(_mem_tracker)); // TODO(yingchun): fixme + buffer->set_mem_tracker(_mem_tracker); } Status RowBatch::resize_and_allocate_tuple_buffer(RuntimeState* state, int64_t* tuple_buffer_size, @@ -567,8 +569,7 @@ void RowBatch::transfer_resource_ownership(RowBatch* dest) { DiskIoMgr::BufferDescriptor* buffer = _io_buffers[i]; dest->_io_buffers.push_back(buffer); dest->_auxiliary_mem_usage += buffer->buffer_len(); - buffer->set_mem_tracker( - std::shared_ptr(dest->_mem_tracker)); // TODO(yingchun): fixme + buffer->set_mem_tracker(dest->_mem_tracker); } _io_buffers.clear(); @@ -677,7 +678,7 @@ void RowBatch::acquire_state(RowBatch* src) { DiskIoMgr::BufferDescriptor* buffer = src->_io_buffers[i]; _io_buffers.push_back(buffer); _auxiliary_mem_usage += buffer->buffer_len(); - buffer->set_mem_tracker(std::shared_ptr(_mem_tracker)); // TODO(yingchun): fixme + buffer->set_mem_tracker(_mem_tracker); } src->_io_buffers.clear(); src->_auxiliary_mem_usage = 0; diff --git a/be/src/runtime/row_batch.h b/be/src/runtime/row_batch.h index 070a1e578fb4e5..5674c1f541fa72 100644 --- a/be/src/runtime/row_batch.h +++ b/be/src/runtime/row_batch.h @@ -83,14 +83,14 @@ class RowBatch : public RowBatchInterface { // Create RowBatch for a maximum of 'capacity' rows of tuples specified // by 'row_desc'. 
- RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_tracker); + RowBatch(const RowDescriptor& row_desc, int capacity); // Populate a row batch from input_batch by copying input_batch's // tuple_data into the row batch's mempool and converting all offsets // in the data back into pointers. // TODO: figure out how to transfer the data from input_batch to this RowBatch // (so that we don't need to make yet another copy) - RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, MemTracker* tracker); + RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch); // Releases all resources accumulated at this row batch. This includes // - tuple_ptrs @@ -394,7 +394,7 @@ class RowBatch : public RowBatchInterface { std::string to_string(); private: - MemTracker* _mem_tracker; // not owned + std::shared_ptr _mem_tracker; // not owned // Close owned tuple streams and delete if needed. void close_tuple_streams(); diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index 42ec2e43561587..e8b6d8b36b7fae 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -27,6 +27,7 @@ #include "runtime/plan_fragment_executor.h" #include "runtime/runtime_filter_mgr.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "service/brpc.h" #include "util/brpc_client_cache.h" #include "util/time.h" @@ -46,13 +47,15 @@ RuntimeFilterMgr::RuntimeFilterMgr(const UniqueId& query_id, RuntimeState* state RuntimeFilterMgr::~RuntimeFilterMgr() {} Status RuntimeFilterMgr::init() { - DCHECK(_state->instance_mem_tracker().get() != nullptr); - _tracker = _state->instance_mem_tracker().get(); + DCHECK(_state->instance_mem_tracker() != nullptr); + _tracker = MemTracker::create_tracker(-1, "RuntimeFilterMgr", _state->instance_mem_tracker(), + MemTrackerLevel::TASK); return Status::OK(); } Status RuntimeFilterMgr::get_filter_by_role(const int filter_id, const 
RuntimeFilterRole role, IRuntimeFilter** target) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_tracker); int32_t key = filter_id; std::map* filter_map = nullptr; @@ -84,6 +87,7 @@ Status RuntimeFilterMgr::regist_filter(const RuntimeFilterRole role, const TRunt const TQueryOptions& options, int node_id) { DCHECK((role == RuntimeFilterRole::CONSUMER && node_id >= 0) || role != RuntimeFilterRole::CONSUMER); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_tracker); int32_t key = desc.filter_id; std::map* filter_map = nullptr; @@ -102,8 +106,8 @@ Status RuntimeFilterMgr::regist_filter(const RuntimeFilterRole role, const TRunt RuntimeFilterMgrVal filter_mgr_val; filter_mgr_val.role = role; - RETURN_IF_ERROR(IRuntimeFilter::create(_state, _tracker, &_pool, &desc, &options, - role, node_id, &filter_mgr_val.filter)); + RETURN_IF_ERROR(IRuntimeFilter::create(_state, &_pool, &desc, &options, role, node_id, + &filter_mgr_val.filter)); filter_map->emplace(key, filter_mgr_val); @@ -111,6 +115,7 @@ Status RuntimeFilterMgr::regist_filter(const RuntimeFilterRole role, const TRunt } Status RuntimeFilterMgr::update_filter(const PPublishFilterRequest* request, const char* data) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_tracker); UpdateRuntimeFilterParams params; params.request = request; params.data = data; @@ -137,26 +142,25 @@ Status RuntimeFilterMgr::get_merge_addr(TNetworkAddress* addr) { } Status RuntimeFilterMergeControllerEntity::_init_with_desc( - const TRuntimeFilterDesc* runtime_filter_desc, - const TQueryOptions* query_options, + const TRuntimeFilterDesc* runtime_filter_desc, const TQueryOptions* query_options, const std::vector* target_info, const int producer_size) { std::lock_guard guard(_filter_map_mutex); std::shared_ptr cntVal = std::make_shared(); // runtime_filter_desc and target will be released, // so we need to copy to cntVal - // TODO: tracker should add a name cntVal->producer_size = producer_size; cntVal->runtime_filter_desc = *runtime_filter_desc; 
cntVal->target_info = *target_info; cntVal->pool.reset(new ObjectPool()); - cntVal->tracker = MemTracker::create_tracker(); - cntVal->filter = cntVal->pool->add( - new IRuntimeFilter(nullptr, cntVal->tracker.get(), cntVal->pool.get())); + cntVal->filter = cntVal->pool->add(new IRuntimeFilter(nullptr, cntVal->pool.get())); std::string filter_id = std::to_string(runtime_filter_desc->filter_id); // LOG(INFO) << "entity filter id:" << filter_id; - cntVal->filter->init_with_desc(&cntVal->runtime_filter_desc, query_options, _fragment_instance_id); + cntVal->filter->init_with_desc(&cntVal->runtime_filter_desc, query_options, + _fragment_instance_id); + cntVal->_tracker = MemTracker::create_tracker( + -1, thread_local_ctx.thread_mem_tracker()->label() + ":FilterID:" + filter_id); _filter_map.emplace(filter_id, cntVal); return Status::OK(); } @@ -166,6 +170,9 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, UniqueId frag const TQueryOptions& query_options) { _query_id = query_id; _fragment_instance_id = fragment_instance_id; + _mem_tracker = MemTracker::create_tracker(-1, "RuntimeFilterMergeControllerEntity", nullptr, + MemTrackerLevel::INSTANCE); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); for (auto& filterid_to_desc : runtime_filter_params.rid_to_runtime_filter) { int filter_id = filterid_to_desc.first; const auto& target_iter = runtime_filter_params.rid_to_target_param.find(filter_id); @@ -176,7 +183,8 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, UniqueId frag if (build_iter == runtime_filter_params.runtime_filter_builder_num.end()) { return Status::InternalError("runtime filter params meet error"); } - _init_with_desc(&filterid_to_desc.second, &query_options, &target_iter->second, build_iter->second); + _init_with_desc(&filterid_to_desc.second, &query_options, &target_iter->second, + build_iter->second); } return Status::OK(); } @@ -184,6 +192,7 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId 
query_id, UniqueId frag // merge data Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* request, const char* data) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::shared_ptr cntVal; int merged_size = 0; { @@ -195,14 +204,13 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ return Status::InvalidArgument("unknown filter id"); } cntVal = iter->second; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(cntVal->_tracker); MergeRuntimeFilterParams params; params.data = data; params.request = request; - std::shared_ptr tracker = iter->second->tracker; ObjectPool* pool = iter->second->pool.get(); RuntimeFilterWrapperHolder holder; - RETURN_IF_ERROR( - IRuntimeFilter::create_wrapper(¶ms, tracker.get(), pool, holder.getHandle())); + RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(¶ms, pool, holder.getHandle())); RETURN_IF_ERROR(cntVal->filter->merge_from(holder.getHandle()->get())); cntVal->arrive_id.insert(UniqueId(request->fragment_id()).to_string()); merged_size = cntVal->arrive_id.size(); diff --git a/be/src/runtime/runtime_filter_mgr.h b/be/src/runtime/runtime_filter_mgr.h index 653ce675b2356a..ec6740673821a9 100644 --- a/be/src/runtime/runtime_filter_mgr.h +++ b/be/src/runtime/runtime_filter_mgr.h @@ -91,7 +91,7 @@ class RuntimeFilterMgr { std::map _producer_map; RuntimeState* _state; - MemTracker* _tracker; + std::shared_ptr _tracker; ObjectPool _pool; TNetworkAddress _merge_addr; @@ -130,13 +130,14 @@ class RuntimeFilterMergeControllerEntity { std::vector target_info; IRuntimeFilter* filter; std::unordered_set arrive_id; // fragment_instance_id ? 
- std::shared_ptr tracker; + std::shared_ptr _tracker; std::shared_ptr pool; }; UniqueId _query_id; UniqueId _fragment_instance_id; // protect _filter_map std::mutex _filter_map_mutex; + std::shared_ptr _mem_tracker; // TODO: convert filter id to i32 // filter-id -> val std::map> _filter_map; diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index c09a138fb5e20a..9051011eff422c 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -54,8 +54,7 @@ namespace doris { RuntimeState::RuntimeState(const TUniqueId& fragment_instance_id, const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env) - : _fragment_mem_tracker(nullptr), - _profile("Fragment " + print_id(fragment_instance_id)), + : _profile("Fragment " + print_id(fragment_instance_id)), _obj_pool(new ObjectPool()), _runtime_filter_mgr(new RuntimeFilterMgr(TUniqueId(), this)), _data_stream_recvrs_pool(new ObjectPool()), @@ -81,8 +80,7 @@ RuntimeState::RuntimeState(const TUniqueId& fragment_instance_id, RuntimeState::RuntimeState(const TPlanFragmentExecParams& fragment_exec_params, const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env) - : _fragment_mem_tracker(nullptr), - _profile("Fragment " + print_id(fragment_exec_params.fragment_instance_id)), + : _profile("Fragment " + print_id(fragment_exec_params.fragment_instance_id)), _obj_pool(new ObjectPool()), _runtime_filter_mgr(new RuntimeFilterMgr(fragment_exec_params.query_id, this)), _data_stream_recvrs_pool(new ObjectPool()), @@ -211,41 +209,30 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { auto mem_tracker_counter = ADD_COUNTER(&_profile, "MemoryLimit", TUnit::BYTES); mem_tracker_counter->set(bytes_limit); - _query_mem_tracker = - MemTracker::create_tracker(bytes_limit, "RuntimeState:query:" + print_id(query_id), - _exec_env->process_mem_tracker(), MemTrackerLevel::INSTANCE); -#ifdef BE_TEST - if 
(ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) { - _new_query_mem_tracker = - _exec_env->task_pool_mem_tracker_registry()->register_query_mem_tracker( - print_id(query_id), bytes_limit); - } -#else - _new_query_mem_tracker = _exec_env->task_pool_mem_tracker_registry()->register_query_mem_tracker( + if (query_type() == TQueryType::SELECT) { + _query_mem_tracker = _exec_env->task_pool_mem_tracker_registry()->register_query_mem_tracker( + print_id(query_id), bytes_limit); + } else if (query_type() == TQueryType::LOAD) { + _query_mem_tracker = _exec_env->task_pool_mem_tracker_registry()->register_load_mem_tracker( print_id(query_id), bytes_limit); -#endif + } else { + DCHECK(false); + } + _instance_mem_tracker = MemTracker::create_tracker( - -1, "RuntimeState:instance:", _query_mem_tracker, MemTrackerLevel::INSTANCE, &_profile); - - /* - // TODO: this is a stopgap until we implement ExprContext - _udf_mem_tracker.reset( - new MemTracker(-1, "UDFs", _instance_mem_tracker.get())); - _udf_pool.reset(new MemPool(_udf_mem_tracker.get())); - */ - // _udf_pool.reset(new MemPool(_instance_mem_tracker.get())); + -1, "RuntimeState:instance:" + print_id(_fragment_instance_id), _query_mem_tracker, + MemTrackerLevel::INSTANCE, &_profile); RETURN_IF_ERROR(init_buffer_poolstate()); _initial_reservations = _obj_pool->add( - new InitialReservations(_obj_pool.get(), _buffer_reservation, _query_mem_tracker, + new InitialReservations(_obj_pool.get(), _buffer_reservation, nullptr, _query_options.initial_reservation_total_claims)); RETURN_IF_ERROR(_initial_reservations->Init(_query_id, min_reservation())); DCHECK_EQ(0, _initial_reservation_refcnt.load()); if (_instance_buffer_reservation != nullptr) { - _instance_buffer_reservation->InitChildTracker(&_profile, _buffer_reservation, - _instance_mem_tracker.get(), + _instance_buffer_reservation->InitChildTracker(&_profile, _buffer_reservation, nullptr, std::numeric_limits::max()); } @@ -277,8 +264,8 @@ Status 
RuntimeState::init_buffer_poolstate() { VLOG_QUERY << "Buffer pool limit for " << print_id(_query_id) << ": " << max_reservation; _buffer_reservation = _obj_pool->add(new ReservationTracker); - _buffer_reservation->InitChildTracker(nullptr, exec_env->buffer_reservation(), - _query_mem_tracker.get(), max_reservation); + _buffer_reservation->InitChildTracker(nullptr, exec_env->buffer_reservation(), nullptr, + max_reservation); return Status::OK(); } @@ -291,7 +278,7 @@ Status RuntimeState::create_block_mgr() { block_mgr_limit = std::numeric_limits::max(); } RETURN_IF_ERROR(BufferedBlockMgr2::create( - this, _query_mem_tracker, runtime_profile(), _exec_env->tmp_file_mgr(), block_mgr_limit, + this, runtime_profile(), _exec_env->tmp_file_mgr(), block_mgr_limit, _exec_env->disk_io_mgr()->max_read_buffer_size(), &_block_mgr2)); return Status::OK(); } @@ -334,46 +321,13 @@ void RuntimeState::get_unreported_errors(std::vector* new_errors) { } } -Status RuntimeState::set_mem_limit_exceeded(MemTracker* tracker, int64_t failed_allocation_size, - const std::string* msg) { - DCHECK_GE(failed_allocation_size, 0); +Status RuntimeState::set_mem_limit_exceeded(const std::string& msg) { { std::lock_guard l(_process_status_lock); if (_process_status.ok()) { - if (msg != nullptr) { - _process_status = Status::MemoryLimitExceeded(*msg); - } else { - _process_status = Status::MemoryLimitExceeded("Memory limit exceeded"); - } - } else { - return _process_status; + _process_status = Status::MemoryLimitExceeded(msg); } } - - DCHECK(_query_mem_tracker.get() != nullptr); - std::stringstream ss; - ss << "Memory Limit Exceeded\n"; - if (failed_allocation_size != 0) { - DCHECK(tracker != nullptr); - ss << " " << tracker->label() << " could not allocate " - << PrettyPrinter::print(failed_allocation_size, TUnit::BYTES) - << " without exceeding limit." 
<< std::endl; - } - - // if (_exec_env->process_mem_tracker()->limit_exceeded()) { - // ss << _exec_env->process_mem_tracker()->log_usage(); - // } else { - // ss << _query_mem_tracker->log_usage(); - // } - // log_error(ErrorMsg(TErrorCode::GENERAL, ss.str())); - log_error(ss.str()); - // Add warning about missing stats except for compute stats child queries. - // if (!query_ctx().__isset.parent_query_id && - // query_ctx().__isset.tables_missing_stats && - // !query_ctx().tables_missing_stats.empty()) { - // LogError(ErrorMsg(TErrorCode::GENERAL, - // GetTablesMissingStatsWarning(query_ctx().tables_missing_stats))); - // } DCHECK(_process_status.is_mem_limit_exceeded()); return _process_status; } @@ -381,7 +335,7 @@ Status RuntimeState::set_mem_limit_exceeded(MemTracker* tracker, int64_t failed_ Status RuntimeState::check_query_state(const std::string& msg) { // TODO: it would be nice if this also checked for cancellation, but doing so breaks // cases where we use Status::Cancelled("Cancelled") to indicate that the limit was reached. 
- RETURN_IF_LIMIT_EXCEEDED(this, msg); + RETURN_IF_LIMIT_EXCEEDED(_instance_mem_tracker, this, msg); return query_status(); } @@ -425,7 +379,7 @@ Status RuntimeState::create_error_log_file() { Status RuntimeState::append_error_msg_to_file(std::function line, std::function error_msg, bool* stop_processing, bool is_summary) { *stop_processing = false; - if (_query_options.query_type != TQueryType::LOAD) { + if (query_type() != TQueryType::LOAD) { return Status::OK(); } // If file havn't been opened, open it here @@ -497,12 +451,6 @@ void RuntimeState::export_load_error(const std::string& err_msg) { } } -// TODO chenhao , check scratch_limit, disable_spilling and file_group -// before spillng -Status RuntimeState::StartSpilling(MemTracker* mem_tracker) { - return Status::InternalError("Mem limit exceeded."); -} - int64_t RuntimeState::get_load_mem_limit() { if (_query_options.__isset.load_mem_limit && _query_options.load_mem_limit > 0) { return _query_options.load_mem_limit; diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 49248b3eb01784..e058706928a086 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -84,7 +84,6 @@ class RuntimeState { // The instance tracker is tied to our profile. // Specific parts of the fragment (i.e. exec nodes, sinks, data stream senders, etc) // will add a fourth level when they are initialized. - // This function also initializes a user function mem tracker (in the fourth level). 
Status init_mem_trackers(const TUniqueId& query_id); // for ut only @@ -113,6 +112,7 @@ class RuntimeState { int max_errors() const { return _query_options.max_errors; } int max_io_buffers() const { return _query_options.max_io_buffers; } int num_scanner_threads() const { return _query_options.num_scanner_threads; } + TQueryType::type query_type() const { return _query_options.query_type; } int64_t timestamp_ms() const { return _timestamp_ms; } const std::string& timezone() const { return _timezone; } const cctz::time_zone& timezone_obj() const { return _timezone_obj; } @@ -121,8 +121,6 @@ class RuntimeState { const TUniqueId& query_id() const { return _query_id; } const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } ExecEnv* exec_env() { return _exec_env; } - const std::vector>& mem_trackers() { return _mem_trackers; } - std::shared_ptr fragment_mem_tracker() { return _fragment_mem_tracker; } std::shared_ptr query_mem_tracker() { return _query_mem_tracker; } std::shared_ptr instance_mem_tracker() { return _instance_mem_tracker; } ThreadResourceMgr::ResourcePool* resource_pool() { return _resource_pool; } @@ -158,22 +156,6 @@ class RuntimeState { return _process_status; }; - // MemPool* udf_pool() { - // return _udf_pool.get(); - // }; - - // Create and return a stream receiver for _fragment_instance_id - // from the data stream manager. The receiver is added to _data_stream_recvrs_pool. 
- DataStreamRecvr* create_recvr(const RowDescriptor& row_desc, PlanNodeId dest_node_id, - int num_senders, int buffer_size, RuntimeProfile* profile); - - // Sets the fragment memory limit and adds it to _mem_trackers - void set_fragment_mem_tracker(std::shared_ptr tracker) { - DCHECK(_fragment_mem_tracker == nullptr); - _fragment_mem_tracker = tracker; - _mem_trackers.push_back(tracker); - } - // Appends error to the _error_log if there is space bool log_error(const std::string& error); @@ -226,19 +208,11 @@ class RuntimeState { _process_status = status; } - // Sets query_status_ to MEM_LIMIT_EXCEEDED and logs all the registered trackers. - // Subsequent calls to this will be no-ops. Returns query_status_. - // If 'failed_allocation_size' is not 0, then it is the size of the allocation (in - // bytes) that would have exceeded the limit allocated for 'tracker'. - // This value and tracker are only used for error reporting. + // Sets _process_status to MEM_LIMIT_EXCEEDED. + // Subsequent calls to this will be no-ops. Returns _process_status. // If 'msg' is non-nullptr, it will be appended to query_status_ in addition to the // generic "Memory limit exceeded" error. - Status set_mem_limit_exceeded(MemTracker* tracker = nullptr, int64_t failed_allocation_size = 0, - const std::string* msg = nullptr); - - Status set_mem_limit_exceeded(const std::string& msg) { - return set_mem_limit_exceeded(nullptr, 0, &msg); - } + Status set_mem_limit_exceeded(const std::string& msg = "Memory limit exceeded"); // Returns a non-OK status if query execution should stop (e.g., the query was cancelled // or a mem limit was exceeded). Exec nodes should check this periodically so execution @@ -397,17 +371,9 @@ class RuntimeState { static const int DEFAULT_BATCH_SIZE = 2048; - // all mem limits that apply to this query - std::vector> _mem_trackers; - - // Fragment memory limit. 
Also contained in _mem_trackers - std::shared_ptr _fragment_mem_tracker; - // MemTracker that is shared by all fragment instances running on this host. // The query mem tracker must be released after the _instance_mem_tracker. std::shared_ptr _query_mem_tracker; - // TODO(zxy): Will replace _query_mem_tracker in future. - std::shared_ptr _new_query_mem_tracker; // Memory usage of this fragment instance std::shared_ptr _instance_mem_tracker; diff --git a/be/src/runtime/sorted_run_merger.cc b/be/src/runtime/sorted_run_merger.cc index 28baab462e8105..e4a15c56a1cd53 100644 --- a/be/src/runtime/sorted_run_merger.cc +++ b/be/src/runtime/sorted_run_merger.cc @@ -178,6 +178,7 @@ class SortedRunMerger::ParallelBatchedRowSupplier : public SortedRunMerger::Batc std::condition_variable _batch_prepared_cv; void process_sorted_run_task() { + // TODO(zxy) Whether to attach mem tracker std::unique_lock lock(_mutex); while (true) { _batch_prepared_cv.wait(lock, [this]() { return !_backup_ready.load(); }); @@ -307,11 +308,9 @@ Status SortedRunMerger::get_next(RowBatch* output_batch, bool* eos) { ChildSortedRunMerger::ChildSortedRunMerger(const TupleRowComparator& compare_less_than, RowDescriptor* row_desc, RuntimeProfile* profile, - MemTracker* parent, uint32_t row_batch_size, - bool deep_copy_input) + uint32_t row_batch_size, bool deep_copy_input) : SortedRunMerger(compare_less_than, row_desc, profile, deep_copy_input), _eos(false), - _parent(parent), _row_batch_size(row_batch_size) { _get_next_timer = ADD_TIMER(profile, "ChildMergeGetNext"); _get_next_batch_timer = ADD_TIMER(profile, "ChildMergeGetNextBatch"); @@ -323,7 +322,7 @@ Status ChildSortedRunMerger::get_batch(RowBatch** output_batch) { return Status::OK(); } - _current_row_batch.reset(new RowBatch(*_input_row_desc, _row_batch_size, _parent)); + _current_row_batch.reset(new RowBatch(*_input_row_desc, _row_batch_size)); bool eos = false; RETURN_IF_ERROR(get_next(_current_row_batch.get(), &eos)); diff --git 
a/be/src/runtime/sorted_run_merger.h b/be/src/runtime/sorted_run_merger.h index b73cdc1b0cbbe7..c448ac8b0cafed 100644 --- a/be/src/runtime/sorted_run_merger.h +++ b/be/src/runtime/sorted_run_merger.h @@ -109,8 +109,7 @@ class SortedRunMerger { class ChildSortedRunMerger : public SortedRunMerger { public: ChildSortedRunMerger(const TupleRowComparator& compare_less_than, RowDescriptor* row_desc, - RuntimeProfile* profile, MemTracker* _parent, uint32_t row_batch_size, - bool deep_copy_input); + RuntimeProfile* profile, uint32_t row_batch_size, bool deep_copy_input); Status get_batch(RowBatch** output_batch) override; @@ -121,8 +120,6 @@ class ChildSortedRunMerger : public SortedRunMerger { // The data in merger is exhaust bool _eos = false; - MemTracker* _parent; - uint32_t _row_batch_size; }; diff --git a/be/src/runtime/spill_sorter.cc b/be/src/runtime/spill_sorter.cc index a461ebe7faff9a..fc9213501e258b 100644 --- a/be/src/runtime/spill_sorter.cc +++ b/be/src/runtime/spill_sorter.cc @@ -638,10 +638,7 @@ Status SpillSorter::Run::prepare_read() { _pin_next_fixed_len_block = _pin_next_var_len_block = false; _num_tuples_returned = 0; - // _buffered_batch.reset(new RowBatch(*_sorter->_output_row_desc, - // _sorter->_state->batch_size(), _sorter->_mem_tracker)); - _buffered_batch.reset(new RowBatch(*_sorter->_output_row_desc, _sorter->_state->batch_size(), - _sorter->_mem_tracker.get())); + _buffered_batch.reset(new RowBatch(*_sorter->_output_row_desc, _sorter->_state->batch_size())); // If the run is pinned, merge is not invoked, so _buffered_batch is not needed // and the individual blocks do not need to be pinned. 
@@ -1253,8 +1250,7 @@ Status SpillSorter::merge_intermediate_runs() { std::min(max_runs_per_intermediate_merge, _sorted_runs.size() - max_runs_per_intermediate_merge); RETURN_IF_ERROR(create_merger(num_runs_to_merge)); - RowBatch intermediate_merge_batch(*_output_row_desc, _state->batch_size(), - _mem_tracker.get()); + RowBatch intermediate_merge_batch(*_output_row_desc, _state->batch_size()); // merged_run is the new sorted run that is produced by the intermediate merge. Run* merged_run = _obj_pool.add(new Run(this, _output_row_desc->tuple_descriptors()[0], false)); diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index bddd9fa281b9b3..e11371ceafe20a 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp @@ -21,6 +21,7 @@ #include "gutil/strings/substitute.h" #include "olap/delta_writer.h" #include "olap/memtable.h" +#include "runtime/thread_context.h" #include "runtime/row_batch.h" #include "runtime/tuple_row.h" #include "util/doris_metrics.h" @@ -32,10 +33,9 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(tablet_writer_count, MetricUnit::NOUNIT); std::atomic TabletsChannel::_s_tablet_writer_count; TabletsChannel::TabletsChannel(const TabletsChannelKey& key, - const std::shared_ptr& mem_tracker, bool is_high_priority) : _key(key), _state(kInitialized), _closed_senders(64), _is_high_priority(is_high_priority) { - _mem_tracker = MemTracker::create_tracker(-1, "TabletsChannel", mem_tracker); + _mem_tracker = MemTracker::create_tracker(-1, "TabletsChannel:" + key.index_id); static std::once_flag once_flag; std::call_once(once_flag, [] { REGISTER_HOOK_METRIC(tablet_writer_count, [&]() { return _s_tablet_writer_count.load(); }); @@ -52,6 +52,7 @@ TabletsChannel::~TabletsChannel() { } Status TabletsChannel::open(const PTabletWriterOpenRequest& request) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (_state == kOpened) { // Normal case, already open by other sender @@ 
-78,6 +79,7 @@ Status TabletsChannel::open(const PTabletWriterOpenRequest& request) { Status TabletsChannel::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(request.tablet_ids_size() == request.row_batch().num_rows()); int64_t cur_seq; { @@ -101,7 +103,7 @@ Status TabletsChannel::add_batch(const PTabletWriterAddBatchRequest& request, } } - RowBatch row_batch(*_row_desc, request.row_batch(), _mem_tracker.get()); + RowBatch row_batch(*_row_desc, request.row_batch()); std::unordered_map /* row index */> tablet_to_rowidxs; for (int i = 0; i < request.tablet_ids_size(); ++i) { int64_t tablet_id = request.tablet_ids(i); @@ -150,6 +152,7 @@ Status TabletsChannel::add_batch(const PTabletWriterAddBatchRequest& request, Status TabletsChannel::close(int sender_id, int64_t backend_id, bool* finished, const google::protobuf::RepeatedField& partition_ids, google::protobuf::RepeatedPtrField* tablet_vec) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (_state == kFinished) { return _close_status; @@ -199,13 +202,12 @@ Status TabletsChannel::close(int sender_id, int64_t backend_id, bool* finished, // tablet_vec will only contains success tablet, and then let FE judge it. 
writer->close_wait(tablet_vec, (_broken_tablets.find(writer->tablet_id()) != _broken_tablets.end())); } - // TODO(gaodayue) clear and destruct all delta writers to make sure all memory are freed - // DCHECK_EQ(_mem_tracker->consumption(), 0); } return Status::OK(); } Status TabletsChannel::reduce_mem_usage(int64_t mem_limit) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (_state == kFinished) { // TabletsChannel is closed without LoadChannel's lock, @@ -261,6 +263,7 @@ Status TabletsChannel::reduce_mem_usage(int64_t mem_limit) { } Status TabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& request) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::vector* index_slots = nullptr; int32_t schema_hash = 0; for (auto& index : _schema->indexes()) { @@ -289,7 +292,7 @@ Status TabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& request wrequest.is_high_priority = _is_high_priority; DeltaWriter* writer = nullptr; - auto st = DeltaWriter::open(&wrequest, _mem_tracker, &writer); + auto st = DeltaWriter::open(&wrequest, &writer); if (st != OLAP_SUCCESS) { std::stringstream ss; ss << "open delta writer failed, tablet_id=" << tablet.tablet_id() @@ -306,6 +309,7 @@ Status TabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& request } Status TabletsChannel::cancel() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (_state == kFinished) { return _close_status; diff --git a/be/src/runtime/tablets_channel.h b/be/src/runtime/tablets_channel.h index e99ac6264b396b..226b2b76db05f5 100644 --- a/be/src/runtime/tablets_channel.h +++ b/be/src/runtime/tablets_channel.h @@ -54,7 +54,7 @@ class OlapTableSchemaParam; // Write channel for a particular (load, index). 
class TabletsChannel { public: - TabletsChannel(const TabletsChannelKey& key, const std::shared_ptr& mem_tracker, bool is_high_priority); + TabletsChannel(const TabletsChannelKey& key, bool is_high_priority); ~TabletsChannel(); diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index d97a3c7c9b745e..72bd87afa3558a 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -22,19 +22,40 @@ #include "runtime/thread_mem_tracker_mgr.h" -#define SCOPED_ATTACH_TASK_THREAD(type, task_id, fragment_instance_id) \ - auto VARNAME_LINENUM(attach_task_thread) = AttachTaskThread(type, task_id, fragment_instance_id) -#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(mem_tracker, action_name, cancel_work) \ - auto VARNAME_LINENUM(switch_tracker) = \ - SwitchThreadMemTracker(mem_tracker, action_name, cancel_work) -#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB(mem_tracker, action_name, cancel_work, \ - err_call_back_func) \ - auto VARNAME_LINENUM(switch_tracker) = \ - SwitchThreadMemTracker(mem_tracker, action_name, cancel_work, err_call_back_func) +// Attach to task when thread starts +#define SCOPED_ATTACH_TASK_THREAD_2ARG(type, mem_tracker) \ + auto VARNAME_LINENUM(attach_task_thread) = AttachTaskThread(type, mem_tracker) +#define SCOPED_ATTACH_TASK_THREAD_4ARG(query_type, task_id, fragment_instance_id, mem_tracker) \ + auto VARNAME_LINENUM(attach_task_thread) = \ + AttachTaskThread(query_type, task_id, fragment_instance_id, mem_tracker) +// Toggle MemTracker during thread execution +#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker) \ + auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker) +#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker, action_type) \ + do { \ + auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker); \ + auto VARNAME_LINENUM(switch_tracker_cb) = SwitchThreadMemTrackerCallBack(action_type); \ + } while (false) +#define 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_3ARG(mem_tracker, action_type, cancel_work) \ + do { \ + auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker); \ + auto VARNAME_LINENUM(switch_tracker_cb) = \ + SwitchThreadMemTrackerCallBack(action_type, cancel_work); \ + } while (false) +#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_4ARG(mem_tracker, action_type, cancel_work, \ + err_call_back_func) \ + do { \ + auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker); \ + auto VARNAME_LINENUM(switch_tracker_cb) = \ + SwitchThreadMemTrackerCallBack(action_type, cancel_work, err_call_back_func); \ + } while (false) +#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB(action_type) \ + auto VARNAME_LINENUM(switch_tracker_cb) = SwitchThreadMemTrackerCallBack(action_type) #define SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER() \ auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(true) #define GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER() \ auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(false) +#define CHECK_MEM_LIMIT(size) thread_local_ctx.thread_mem_tracker()->check_limit(size) namespace doris { @@ -65,23 +86,22 @@ class ThreadContext { ~ThreadContext() {} void attach(const TaskType& type, const std::string& task_id, - const TUniqueId& fragment_instance_id = TUniqueId()) { + const TUniqueId& fragment_instance_id, std::shared_ptr mem_tracker) { _type = type; _task_id = task_id; - if (type == TaskType::QUERY) { - _fragment_instance_id = fragment_instance_id; - _thread_mem_tracker_mgr->attach_query(task_id, fragment_instance_id); - } + _fragment_instance_id = fragment_instance_id; + _thread_mem_tracker_mgr->attach_task(get_type(), task_id, fragment_instance_id, + mem_tracker); } void detach() { _type = TaskType::UNKNOWN; _task_id = ""; _fragment_instance_id = TUniqueId(); - _thread_mem_tracker_mgr->detach(); + _thread_mem_tracker_mgr->detach_task(); } - const std::string type() const; + const std::string get_type() const; const std::string& 
task_id() const { return _task_id; } const std::thread::id& thread_id() const { return _thread_id; } const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } @@ -105,8 +125,8 @@ class ThreadContext { return _thread_mem_tracker_mgr->update_tracker(mem_tracker); } std::shared_ptr update_thread_tracker_call_back( - const std::string& action_name, bool cancel_task, ERRCALLBACK err_call_back_func) { - return _thread_mem_tracker_mgr->update_consume_err_call_back(action_name, cancel_task, + const std::string& action_type, bool cancel_task, ERRCALLBACK err_call_back_func) { + return _thread_mem_tracker_mgr->update_consume_err_call_back(action_type, cancel_task, err_call_back_func); } std::shared_ptr update_thread_tracker_call_back( @@ -154,15 +174,33 @@ inline const std::string task_type_string(ThreadContext::TaskType type) { } } -inline const std::string ThreadContext::type() const { +inline const std::string ThreadContext::get_type() const { return task_type_string(_type); } class AttachTaskThread { public: - explicit AttachTaskThread(const ThreadContext::TaskType& type, const std::string& task_id, - const TUniqueId& fragment_instance_id = TUniqueId()) { - thread_local_ctx.attach(type, task_id, fragment_instance_id); + explicit AttachTaskThread(const ThreadContext::TaskType& type, + std::shared_ptr mem_tracker) { + DCHECK(mem_tracker != nullptr); + init(type, "", TUniqueId(), mem_tracker); + } + + explicit AttachTaskThread(const TQueryType::type& query_type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + std::shared_ptr mem_tracker) { + DCHECK(task_id != "" && fragment_instance_id != TUniqueId() && mem_tracker != nullptr); + if (query_type == TQueryType::SELECT) { + init(ThreadContext::TaskType::QUERY, task_id, fragment_instance_id, mem_tracker); + } else if (query_type == TQueryType::LOAD) { + init(ThreadContext::TaskType::LOAD, task_id, fragment_instance_id, mem_tracker); + } + } + + void init(const 
ThreadContext::TaskType& type, const std::string& task_id = "", + const TUniqueId& fragment_instance_id = TUniqueId(), + std::shared_ptr mem_tracker = nullptr) { + thread_local_ctx.attach(type, task_id, fragment_instance_id, mem_tracker); } ~AttachTaskThread() { thread_local_ctx.detach(); } @@ -170,22 +208,53 @@ class AttachTaskThread { class SwitchThreadMemTracker { public: - explicit SwitchThreadMemTracker(std::shared_ptr mem_tracker, - const std::string& action_name = std::string(), - bool cancel_work = true, - ERRCALLBACK err_call_back_func = nullptr) { - _old_mem_tracker = thread_local_ctx.update_thread_tracker(mem_tracker); - _old_tracker_call_back = thread_local_ctx.update_thread_tracker_call_back( - action_name, cancel_work, err_call_back_func); + explicit SwitchThreadMemTracker(std::shared_ptr mem_tracker) { + DCHECK(mem_tracker != nullptr); + if (mem_tracker != thread_local_ctx.thread_mem_tracker()) { + _old_mem_tracker = thread_local_ctx.update_thread_tracker(mem_tracker); + } } ~SwitchThreadMemTracker() { - thread_local_ctx.update_thread_tracker(_old_mem_tracker); - thread_local_ctx.update_thread_tracker_call_back(_old_tracker_call_back); + std::shared_ptr p = _old_mem_tracker.lock(); + if (p) { + thread_local_ctx.update_thread_tracker(_old_mem_tracker); + } } private: std::weak_ptr _old_mem_tracker; +}; + +class SwitchThreadMemTrackerCallBack { +public: + explicit SwitchThreadMemTrackerCallBack(const std::string& action_type) { + DCHECK(action_type != std::string()); + init(action_type); + } + + explicit SwitchThreadMemTrackerCallBack(const std::string& action_type, bool cancel_work) { + DCHECK(action_type != std::string()); + init(action_type, cancel_work); + } + + explicit SwitchThreadMemTrackerCallBack(const std::string& action_type, bool cancel_work, + ERRCALLBACK err_call_back_func) { + DCHECK(action_type != std::string() && err_call_back_func != nullptr); + init(action_type, cancel_work, err_call_back_func); + } + + void init(const std::string& 
action_type = std::string(), bool cancel_work = true, + ERRCALLBACK err_call_back_func = nullptr) { + _old_tracker_call_back = thread_local_ctx.update_thread_tracker_call_back( + action_type, cancel_work, err_call_back_func); + } + + ~SwitchThreadMemTrackerCallBack() { + thread_local_ctx.update_thread_tracker_call_back(_old_tracker_call_back); + } + +private: std::shared_ptr _old_tracker_call_back; }; diff --git a/be/src/runtime/thread_mem_tracker_mgr.cpp b/be/src/runtime/thread_mem_tracker_mgr.cpp index 053991e53f5f89..2c3696e7f85d23 100644 --- a/be/src/runtime/thread_mem_tracker_mgr.cpp +++ b/be/src/runtime/thread_mem_tracker_mgr.cpp @@ -23,8 +23,7 @@ namespace doris { std::shared_ptr ThreadMemTrackerMgr::default_mem_tracker() { - ExecEnv* exec_env = ExecEnv::GetInstance(); - std::shared_ptr process_tracker = exec_env->new_process_mem_tracker(); + std::shared_ptr process_tracker = ExecEnv::GetInstance()->process_mem_tracker(); if (process_tracker != nullptr) { return process_tracker; } else { @@ -32,26 +31,32 @@ std::shared_ptr ThreadMemTrackerMgr::default_mem_tracker() { } } -void ThreadMemTrackerMgr::attach_query(const std::string& query_id, - const TUniqueId& fragment_instance_id) { - DCHECK(query_id != "" && fragment_instance_id != TUniqueId()); - _query_id = query_id; +void ThreadMemTrackerMgr::attach_task(const std::string& action_type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + std::shared_ptr mem_tracker) { + DCHECK(task_id != "" && fragment_instance_id != TUniqueId()); + _task_id = task_id; _fragment_instance_id = fragment_instance_id; - _consume_err_call_back = std::make_shared("Query", true, nullptr); + _consume_err_call_back = std::make_shared(action_type, true, nullptr); + if (mem_tracker == nullptr) { #ifdef BE_TEST - if (ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) { - return; - } + if (ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) { + return; + } #endif - 
update_tracker(ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_query_mem_tracker( - query_id)); + update_tracker( + ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker( + task_id)); + } else { + update_tracker(mem_tracker); + } } -void ThreadMemTrackerMgr::detach() { +void ThreadMemTrackerMgr::detach_task() { update_tracker(default_mem_tracker()); - _query_id = ""; + _task_id = ""; _fragment_instance_id = TUniqueId(); - _consume_err_call_back = std::make_shared("", true, nullptr); + _consume_err_call_back = std::make_shared("", false, nullptr); } std::weak_ptr ThreadMemTrackerMgr::update_tracker( @@ -68,10 +73,10 @@ std::weak_ptr ThreadMemTrackerMgr::update_tracker( } std::shared_ptr ThreadMemTrackerMgr::update_consume_err_call_back( - const std::string& action_name, bool cancel_task, ERRCALLBACK call_back_func) { + const std::string& action_type, bool cancel_task, ERRCALLBACK call_back_func) { std::shared_ptr old_consume_err_call_back = _consume_err_call_back; _consume_err_call_back = - std::make_shared(action_name, cancel_task, call_back_func); + std::make_shared(action_type, cancel_task, call_back_func); return old_consume_err_call_back; } @@ -82,41 +87,29 @@ std::shared_ptr ThreadMemTrackerMgr::update_consume_err_ return old_consume_err_call_back; } -void ThreadMemTrackerMgr::exceeded_cancel_query() { - if (_fragment_instance_id != TUniqueId() && ExecEnv::GetInstance()->initialized() && +void ThreadMemTrackerMgr::exceeded_cancel_task(const std::string& cancel_details) { + std::shared_ptr task_mem_tracker = + ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker( + _task_id); + if (task_mem_tracker != nullptr && task_mem_tracker->limit_exceeded() && + _fragment_instance_id != TUniqueId() && ExecEnv::GetInstance()->initialized() && ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { - std::string detail = - " {} Memory exceed limit in TCMalloc Hook New, Backend: {}, 
QueryID: {}, " - "FragmentID: {}, Used: {}, Limit: {}. You can change the limit by session variable " - "exec_mem_limit."; ExecEnv::GetInstance()->fragment_mgr()->cancel( _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED, - fmt::format(detail, _consume_err_call_back->action_name, - BackendOptions::get_localhost(), _query_id, - print_id(_fragment_instance_id), - std::to_string(_mem_tracker.lock()->consumption()), - std::to_string(_mem_tracker.lock()->limit()))); + cancel_details); _fragment_instance_id = TUniqueId(); // Make sure it will only be canceled once } } -void ThreadMemTrackerMgr::exceeded(Status st, int64_t mem_usage) { - DCHECK(st.is_mem_limit_exceeded()); - std::string detail = st.to_string() + ", in TCMalloc Hook New."; - auto rst = _mem_tracker.lock()->mem_limit_exceeded(nullptr, detail, mem_usage); +void ThreadMemTrackerMgr::exceeded(int64_t mem_usage, Status st) { + auto rst = _mem_tracker.lock()->mem_limit_exceeded( + nullptr, "In TCMalloc Hook, " + _consume_err_call_back->action_type, mem_usage, st); if (_consume_err_call_back->call_back_func != nullptr) { _consume_err_call_back->call_back_func(); } - if (_query_id != "") { - std::shared_ptr query_mem_tracker = - ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_query_mem_tracker( - _query_id); - if (_consume_err_call_back->cancel_task == true || - (query_mem_tracker != nullptr && query_mem_tracker->limit_exceeded())) { - exceeded_cancel_query(); - } + if (_task_id != "" && _consume_err_call_back->cancel_task == true) { + exceeded_cancel_task(rst.to_string()); } - LOG(WARNING) << rst.to_string(); } void ThreadMemTrackerMgr::noncache_consume() { @@ -131,7 +124,7 @@ void ThreadMemTrackerMgr::noncache_consume() { // The memory has been allocated, so when TryConsume fails, need to continue to complete // the consume to ensure the accuracy of the statistics. 
_mem_tracker.lock()->consume(_untracked_mem); - exceeded(st, _untracked_mem); + exceeded(_untracked_mem, st); } } } diff --git a/be/src/runtime/thread_mem_tracker_mgr.h b/be/src/runtime/thread_mem_tracker_mgr.h index d519ae4eeeec83..5d56453ff260e5 100644 --- a/be/src/runtime/thread_mem_tracker_mgr.h +++ b/be/src/runtime/thread_mem_tracker_mgr.h @@ -28,12 +28,12 @@ namespace doris { typedef void (*ERRCALLBACK)(); struct ConsumeErrCallBackInfo { - std::string action_name; - bool cancel_task; + std::string action_type; + bool cancel_task; // Whether to cancel the task when the current tracker exceeds the limit ERRCALLBACK call_back_func; - ConsumeErrCallBackInfo(std::string action_name, bool cancel_task, ERRCALLBACK call_back_func) - : action_name(action_name), cancel_task(cancel_task), call_back_func(call_back_func) {} + ConsumeErrCallBackInfo(std::string action_type, bool cancel_task, ERRCALLBACK call_back_func) + : action_type(action_type), cancel_task(cancel_task), call_back_func(call_back_func) {} }; // TCMalloc new/delete Hook is counted in the memory_tracker of the current thread. @@ -45,15 +45,19 @@ struct ConsumeErrCallBackInfo { // need to manually call cosume after stop_mem_tracker, and then start_mem_tracker. 
class ThreadMemTrackerMgr { public: - ThreadMemTrackerMgr() : _mem_tracker(default_mem_tracker()) {} - ~ThreadMemTrackerMgr() { detach(); } + ThreadMemTrackerMgr() : _mem_tracker(default_mem_tracker()) { + _consume_err_call_back = std::make_shared("", false, nullptr); + } + ~ThreadMemTrackerMgr() { detach_task(); } std::shared_ptr default_mem_tracker(); - // After attach, the current thread TCMalloc Hook starts to consume/release query mem_tracker - void attach_query(const std::string& query_id, const TUniqueId& fragment_instance_id); + // After attach, the current thread TCMalloc Hook starts to consume/release task mem_tracker + void attach_task(const std::string& action_type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + std::shared_ptr mem_tracker); - void detach(); + void detach_task(); std::weak_ptr update_tracker(std::weak_ptr mem_tracker); std::shared_ptr update_consume_err_call_back( @@ -73,9 +77,10 @@ class ThreadMemTrackerMgr { void start_mem_tracker() { _stop_mem_tracker = false; } private: - void exceeded_cancel_query(); + // If tryConsume fails due to task mem tracker exceeding the limit, the task must be canceled + void exceeded_cancel_task(const std::string& cancel_details); - void exceeded(Status st, int64_t mem_usage); + void exceeded(int64_t mem_usage, Status st); private: std::weak_ptr _mem_tracker; @@ -94,7 +99,7 @@ class ThreadMemTrackerMgr { std::shared_ptr _consume_err_call_back; - std::string _query_id; + std::string _task_id; TUniqueId _fragment_instance_id; }; diff --git a/be/src/runtime/vectorized_row_batch.cpp b/be/src/runtime/vectorized_row_batch.cpp index f26822833ce96c..06a19fcd9e6ea8 100644 --- a/be/src/runtime/vectorized_row_batch.cpp +++ b/be/src/runtime/vectorized_row_batch.cpp @@ -23,14 +23,12 @@ namespace doris { VectorizedRowBatch::VectorizedRowBatch(const TabletSchema* schema, - const std::vector& cols, int capacity, - const std::shared_ptr& parent_tracker) + const std::vector& cols, int capacity) : 
_schema(schema), _cols(cols), _capacity(capacity), _limit(capacity) { _selected_in_use = false; _size = 0; - _tracker = MemTracker::create_tracker(-1, "VectorizedRowBatch", parent_tracker); - _mem_pool.reset(new MemPool(_tracker.get())); + _mem_pool.reset(new MemPool()); _selected = reinterpret_cast(new char[sizeof(uint16_t) * _capacity]); diff --git a/be/src/runtime/vectorized_row_batch.h b/be/src/runtime/vectorized_row_batch.h index 2f29f38cc345dc..6819f01c6ab1fc 100644 --- a/be/src/runtime/vectorized_row_batch.h +++ b/be/src/runtime/vectorized_row_batch.h @@ -61,8 +61,7 @@ class ColumnVector { class VectorizedRowBatch { public: - VectorizedRowBatch(const TabletSchema* schema, const std::vector& cols, int capacity, - const std::shared_ptr& parent_tracker = nullptr); + VectorizedRowBatch(const TabletSchema* schema, const std::vector& cols, int capacity); ~VectorizedRowBatch() { for (auto vec : _col_vectors) { @@ -120,7 +119,6 @@ class VectorizedRowBatch { bool _selected_in_use = false; uint8_t _block_status; - std::shared_ptr _tracker; std::unique_ptr _mem_pool; uint16_t _limit; }; diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index a5f27b76930d6c..62b07ee0745cc1 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -290,7 +290,7 @@ int main(int argc, char** argv) { #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) doris::MemInfo::refresh_current_mem(); #endif - doris::ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->logout_query_mem_tracker(); + doris::ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->logout_task_mem_tracker(); sleep(10); } diff --git a/be/src/testutil/function_utils.cpp b/be/src/testutil/function_utils.cpp index 2ebb4c22f8b94a..28aaeb2455d977 100644 --- a/be/src/testutil/function_utils.cpp +++ b/be/src/testutil/function_utils.cpp @@ -20,7 +20,6 @@ #include #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include 
"udf/udf_internal.h" namespace doris { @@ -28,8 +27,7 @@ namespace doris { FunctionUtils::FunctionUtils() { doris_udf::FunctionContext::TypeDesc return_type; std::vector arg_types; - _mem_tracker.reset(new MemTracker(-1, "function util")); - _memory_pool = new MemPool(_mem_tracker.get()); + _memory_pool = new MemPool(); _fn_ctx = FunctionContextImpl::create_context(_state, _memory_pool, return_type, arg_types, 0, false); } @@ -37,8 +35,7 @@ FunctionUtils::FunctionUtils(RuntimeState* state) { _state = state; doris_udf::FunctionContext::TypeDesc return_type; std::vector arg_types; - _mem_tracker.reset(new MemTracker(-1, "function util")); - _memory_pool = new MemPool(_mem_tracker.get()); + _memory_pool = new MemPool(); _fn_ctx = FunctionContextImpl::create_context(_state, _memory_pool, return_type, arg_types, 0, false); } @@ -46,8 +43,7 @@ FunctionUtils::FunctionUtils(RuntimeState* state) { FunctionUtils::FunctionUtils(const doris_udf::FunctionContext::TypeDesc& return_type, const std::vector& arg_types, int varargs_buffer_size) { - _mem_tracker.reset(new MemTracker(-1, "function util")); - _memory_pool = new MemPool(_mem_tracker.get()); + _memory_pool = new MemPool(); _fn_ctx = FunctionContextImpl::create_context(_state, _memory_pool, return_type, arg_types, varargs_buffer_size, false); } diff --git a/be/src/testutil/function_utils.h b/be/src/testutil/function_utils.h index 30b2a6cdffdbeb..0b0902ddac9d7c 100644 --- a/be/src/testutil/function_utils.h +++ b/be/src/testutil/function_utils.h @@ -23,7 +23,6 @@ namespace doris { class MemPool; -class MemTracker; class RuntimeState; class FunctionUtils { @@ -39,7 +38,6 @@ class FunctionUtils { private: RuntimeState* _state = nullptr; - std::shared_ptr _mem_tracker; MemPool* _memory_pool = nullptr; doris_udf::FunctionContext* _fn_ctx = nullptr; }; diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp index 3b0e0ca1db055a..590bf52017491a 100644 --- a/be/src/util/arrow/row_batch.cpp +++ 
b/be/src/util/arrow/row_batch.cpp @@ -365,9 +365,8 @@ class ToRowBatchConverter : public arrow::ArrayVisitor { public: using arrow::ArrayVisitor::Visit; - ToRowBatchConverter(const arrow::RecordBatch& batch, const RowDescriptor& row_desc, - const std::shared_ptr& tracker) - : _batch(batch), _row_desc(row_desc), _tracker(tracker) {} + ToRowBatchConverter(const arrow::RecordBatch& batch, const RowDescriptor& row_desc) + : _batch(batch), _row_desc(row_desc) {} #define PRIMITIVE_VISIT(TYPE) \ arrow::Status Visit(const arrow::TYPE& array) override { return _visit(array); } @@ -407,7 +406,6 @@ class ToRowBatchConverter : public arrow::ArrayVisitor { private: const arrow::RecordBatch& _batch; const RowDescriptor& _row_desc; - std::shared_ptr _tracker; std::unique_ptr _cur_slot_ref; std::shared_ptr _output; @@ -427,7 +425,7 @@ Status ToRowBatchConverter::convert(std::shared_ptr* result) { // TODO(zc): check if field type match size_t num_rows = _batch.num_rows(); - _output.reset(new RowBatch(_row_desc, num_rows, _tracker.get())); + _output.reset(new RowBatch(_row_desc, num_rows)); _output->commit_rows(num_rows); auto pool = _output->tuple_data_pool(); for (size_t row_id = 0; row_id < num_rows; ++row_id) { @@ -453,9 +451,8 @@ Status ToRowBatchConverter::convert(std::shared_ptr* result) { } Status convert_to_row_batch(const arrow::RecordBatch& batch, const RowDescriptor& row_desc, - const std::shared_ptr& tracker, std::shared_ptr* result) { - ToRowBatchConverter converter(batch, row_desc, tracker); + ToRowBatchConverter converter(batch, row_desc); return converter.convert(result); } diff --git a/be/src/util/arrow/row_batch.h b/be/src/util/arrow/row_batch.h index a7c2f3991d5012..f75b060502f411 100644 --- a/be/src/util/arrow/row_batch.h +++ b/be/src/util/arrow/row_batch.h @@ -35,7 +35,6 @@ class Schema; namespace doris { -class MemTracker; class ObjectPool; class RowBatch; class RowDescriptor; @@ -56,10 +55,8 @@ Status convert_to_arrow_batch(const RowBatch& batch, const 
std::shared_ptr* result); // Convert an Arrow RecordBatch to a Doris RowBatch. A valid RowDescriptor -// whose schema is the same with RecordBatch's should be given. Memory used -// by result RowBatch will be tracked by tracker. +// whose schema is the same with RecordBatch's should be given. Status convert_to_row_batch(const arrow::RecordBatch& batch, const RowDescriptor& row_desc, - const std::shared_ptr& tracker, std::shared_ptr* result); Status serialize_record_batch(const arrow::RecordBatch& record_batch, std::string* result); diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h index 8015dcaefac3f1..e99f5980fcd32f 100644 --- a/be/src/util/doris_metrics.h +++ b/be/src/util/doris_metrics.h @@ -183,6 +183,7 @@ class DorisMetrics { UIntGauge* compaction_mem_consumption; UIntGauge* load_mem_consumption; + UIntGauge* load_channel_mem_consumption; UIntGauge* query_mem_consumption; UIntGauge* schema_change_mem_consumption; UIntGauge* tablet_meta_mem_consumption; diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index 4fa54291820ca5..404d6dc2230b29 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -20,6 +20,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/mem_tracker.h" #include "runtime/runtime_filter_mgr.h" +#include "runtime/thread_context.h" #include "util/defer_op.h" #include "vec/core/materialize_block.h" #include "vec/exprs/vexpr.h" @@ -50,7 +51,7 @@ struct ProcessHashTableBuild { Defer defer {[&]() { int64_t bucket_size = hash_table_ctx.hash_table.get_buffer_size_in_cells(); int64_t bucket_bytes = hash_table_ctx.hash_table.get_buffer_size_in_bytes(); - _join_node->_mem_tracker->Consume(bucket_bytes - old_bucket_bytes); + _join_node->_hash_table_mem_tracker->consume(bucket_bytes - old_bucket_bytes); _join_node->_mem_used += bucket_bytes - old_bucket_bytes; COUNTER_SET(_join_node->_build_buckets_counter, bucket_size); }}; @@ -596,6 
+597,8 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status HashJoinNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _hash_table_mem_tracker = MemTracker::create_virtual_tracker(-1, "VSetOperationNode:HashTable"); // Build phase auto build_phase_profile = runtime_profile()->create_child("BuildPhase", true, true); @@ -642,10 +645,11 @@ Status HashJoinNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_vother_join_conjunct_ptr) (*_vother_join_conjunct_ptr)->close(state); - _mem_tracker->Release(_mem_used); + _hash_table_mem_tracker->release(_mem_used); return ExecNode::close(state); } @@ -783,6 +787,7 @@ Status HashJoinNode::get_next(RuntimeState* state, Block* output_block, bool* eo } Status HashJoinNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -802,6 +807,7 @@ Status HashJoinNode::open(RuntimeState* state) { Status HashJoinNode::_hash_table_build(RuntimeState* state) { RETURN_IF_ERROR(child(1)->open(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB("Hash join, while constructing the hash table"); SCOPED_TIMER(_build_timer); Block block; @@ -811,12 +817,9 @@ Status HashJoinNode::_hash_table_build(RuntimeState* state) { RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(child(1)->get_next(state, &block, &eos)); - _mem_tracker->Consume(block.allocated_bytes()); + _hash_table_mem_tracker->consume(block.allocated_bytes()); _mem_used += block.allocated_bytes(); - RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while getting next from the child 1."); - RETURN_IF_ERROR(_process_build_block(state, block)); - RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while 
constructing the hash table."); } return std::visit( diff --git a/be/src/vec/exec/join/vhash_join_node.h b/be/src/vec/exec/join/vhash_join_node.h index 65b2328e3ff6fe..589bd5294227dc 100644 --- a/be/src/vec/exec/join/vhash_join_node.h +++ b/be/src/vec/exec/join/vhash_join_node.h @@ -197,6 +197,8 @@ class HashJoinNode : public ::doris::ExecNode { RowDescriptor _row_desc_for_other_join_conjunt; + std::shared_ptr _hash_table_mem_tracker; + private: Status _hash_table_build(RuntimeState* state); Status _process_build_block(RuntimeState* state, Block& block); diff --git a/be/src/vec/exec/vaggregation_node.cpp b/be/src/vec/exec/vaggregation_node.cpp index 76b4b349efc4d0..84678c526910ac 100644 --- a/be/src/vec/exec/vaggregation_node.cpp +++ b/be/src/vec/exec/vaggregation_node.cpp @@ -22,6 +22,7 @@ #include "exec/exec_node.h" #include "runtime/mem_pool.h" #include "runtime/row_batch.h" +#include "runtime/thread_context.h" #include "util/defer_op.h" #include "vec/core/block.h" #include "vec/data_types/data_type_nullable.h" @@ -203,11 +204,13 @@ void AggregationNode::_init_hash_method(std::vector& probe_exprs) Status AggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _exec_timer = ADD_TIMER(runtime_profile(), "ExecTime"); _merge_timer = ADD_TIMER(runtime_profile(), "MergeTime"); _expr_timer = ADD_TIMER(runtime_profile(), "ExprTime"); _get_results_timer = ADD_TIMER(runtime_profile(), "GetResultsTime"); + _data_mem_tracker = MemTracker::create_virtual_tracker(-1, "AggregationNode:Data", mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id); @@ -216,7 +219,7 @@ Status AggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR( VExpr::prepare(_probe_expr_ctxs, state, child(0)->row_desc(), expr_mem_tracker())); - 
_mem_pool = std::make_unique(mem_tracker().get()); + _mem_pool = std::make_unique(); int j = _probe_expr_ctxs.size(); for (int i = 0; i < j; ++i) { @@ -330,6 +333,7 @@ Status AggregationNode::prepare(RuntimeState* state) { } Status AggregationNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "aggregator, while execute open"); RETURN_IF_ERROR(ExecNode::open(state)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -355,7 +359,6 @@ Status AggregationNode::open(RuntimeState* state) { } RETURN_IF_ERROR(_executor.execute(&block)); _executor.update_memusage(); - RETURN_IF_LIMIT_EXCEEDED(state, "aggregator, while execute open."); } return Status::OK(); @@ -366,6 +369,7 @@ Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* } Status AggregationNode::get_next(RuntimeState* state, Block* block, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "aggregator, while execute get_next"); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (_is_streaming_preagg) { @@ -395,12 +399,12 @@ Status AggregationNode::get_next(RuntimeState* state, Block* block, bool* eos) { } _executor.update_memusage(); - RETURN_IF_LIMIT_EXCEEDED(state, "aggregator, while execute get_next."); return Status::OK(); } Status AggregationNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::close(state)); VExpr::close(_probe_expr_ctxs, state); @@ -555,7 +559,7 @@ Status AggregationNode::_merge_without_key(Block* block) { } void AggregationNode::_update_memusage_without_key() { - mem_tracker()->Consume(_agg_arena_pool.size() - _mem_usage_record.used_in_arena); + _data_mem_tracker->consume(_agg_arena_pool.size() - _mem_usage_record.used_in_arena); _mem_usage_record.used_in_arena = _agg_arena_pool.size(); } @@ -1078,8 +1082,8 @@ void AggregationNode::_update_memusage_with_serialized_key() { 
std::visit( [&](auto&& agg_method) -> void { auto& data = agg_method.data; - mem_tracker()->Consume(_agg_arena_pool.size() - _mem_usage_record.used_in_arena); - mem_tracker()->Consume(data.get_buffer_size_in_bytes() - + _data_mem_tracker->consume(_agg_arena_pool.size() - _mem_usage_record.used_in_arena); + _data_mem_tracker->consume(data.get_buffer_size_in_bytes() - _mem_usage_record.used_in_state); _mem_usage_record.used_in_state = data.get_buffer_size_in_bytes(); _mem_usage_record.used_in_arena = _agg_arena_pool.size(); @@ -1103,7 +1107,7 @@ void AggregationNode::_close_with_serialized_key() { } void AggregationNode::release_tracker() { - mem_tracker()->Release(_mem_usage_record.used_in_state + _mem_usage_record.used_in_arena); + _data_mem_tracker->release(_mem_usage_record.used_in_state + _mem_usage_record.used_in_arena); } } // namespace doris::vectorized diff --git a/be/src/vec/exec/vaggregation_node.h b/be/src/vec/exec/vaggregation_node.h index 4df933f91717f0..358774d09898a6 100644 --- a/be/src/vec/exec/vaggregation_node.h +++ b/be/src/vec/exec/vaggregation_node.h @@ -406,6 +406,8 @@ class AggregationNode : public ::doris::ExecNode { bool _is_merge; std::unique_ptr _mem_pool; + std::shared_ptr _data_mem_tracker; + size_t _align_aggregate_states = 1; /// The offset to the n-th aggregate function in a row of aggregate functions. 
Sizes _offsets_of_aggregate_states; diff --git a/be/src/vec/exec/vanalytic_eval_node.cpp b/be/src/vec/exec/vanalytic_eval_node.cpp index 4d69716216c7cb..280acfa7e403dc 100644 --- a/be/src/vec/exec/vanalytic_eval_node.cpp +++ b/be/src/vec/exec/vanalytic_eval_node.cpp @@ -22,6 +22,7 @@ #include "runtime/descriptors.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "udf/udf_internal.h" #include "vec/utils/util.hpp" @@ -142,8 +143,9 @@ Status VAnalyticEvalNode::init(const TPlanNode& tnode, RuntimeState* state) { Status VAnalyticEvalNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); DCHECK(child(0)->row_desc().is_prefix_of(row_desc())); - _mem_pool.reset(new MemPool(mem_tracker().get())); + _mem_pool.reset(new MemPool()); _evaluation_timer = ADD_TIMER(runtime_profile(), "EvaluationTime"); SCOPED_TIMER(_evaluation_timer); @@ -207,6 +209,7 @@ Status VAnalyticEvalNode::prepare(RuntimeState* state) { } Status VAnalyticEvalNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_CANCELLED(state); @@ -223,6 +226,7 @@ Status VAnalyticEvalNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); ExecNode::close(state); _destory_agg_status(); return Status::OK(); @@ -233,6 +237,7 @@ Status VAnalyticEvalNode::get_next(RuntimeState* state, RowBatch* row_batch, boo } Status VAnalyticEvalNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); 
RETURN_IF_CANCELLED(state); diff --git a/be/src/vec/exec/vblocking_join_node.cpp b/be/src/vec/exec/vblocking_join_node.cpp index af1adb957ed7e5..3266b5de204c61 100644 --- a/be/src/vec/exec/vblocking_join_node.cpp +++ b/be/src/vec/exec/vblocking_join_node.cpp @@ -22,6 +22,7 @@ #include "exprs/expr.h" #include "gen_cpp/PlanNodes_types.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris::vectorized { @@ -39,8 +40,9 @@ Status VBlockingJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status VBlockingJoinNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); - _build_pool.reset(new MemPool(mem_tracker().get())); + _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _left_child_timer = ADD_TIMER(runtime_profile(), "LeftChildTime"); _build_row_counter = ADD_COUNTER(runtime_profile(), "BuildRows", TUnit::UNIT); @@ -62,11 +64,14 @@ Status VBlockingJoinNode::prepare(RuntimeState* state) { Status VBlockingJoinNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); ExecNode::close(state); return Status::OK(); } void VBlockingJoinNode::build_side_thread(RuntimeState* state, std::promise* status) { + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), mem_tracker()); status->set_value(construct_build_side(state)); // Release the thread token as soon as possible (before the main thread joins // on it). 
This way, if we had a chain of 10 joins using 1 additional thread, @@ -75,6 +80,7 @@ void VBlockingJoinNode::build_side_thread(RuntimeState* state, std::promisetotal_time_counter()); RETURN_IF_CANCELLED(state); diff --git a/be/src/vec/exec/vcross_join_node.cpp b/be/src/vec/exec/vcross_join_node.cpp index 6d48527f73df2f..d03ab09dbd48c8 100644 --- a/be/src/vec/exec/vcross_join_node.cpp +++ b/be/src/vec/exec/vcross_join_node.cpp @@ -23,6 +23,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris::vectorized { @@ -33,6 +34,8 @@ VCrossJoinNode::VCrossJoinNode(ObjectPool* pool, const TPlanNode& tnode, const D Status VCrossJoinNode::prepare(RuntimeState* state) { DCHECK(_join_op == TJoinOp::CROSS_JOIN); RETURN_IF_ERROR(VBlockingJoinNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _block_mem_tracker = MemTracker::create_virtual_tracker(-1, "VCrossJoinNode:Block", mem_tracker()); _num_existing_columns = child(0)->row_desc().num_materialized_slots(); _num_columns_to_add = child(1)->row_desc().num_materialized_slots(); @@ -44,7 +47,7 @@ Status VCrossJoinNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - _mem_tracker->Release(_total_mem_usage); + _block_mem_tracker->release(_total_mem_usage); VBlockingJoinNode::close(state); return Status::OK(); } @@ -52,6 +55,7 @@ Status VCrossJoinNode::close(RuntimeState* state) { Status VCrossJoinNode::construct_build_side(RuntimeState* state) { // Do a full scan of child(1) and store all build row batches. 
RETURN_IF_ERROR(child(1)->open(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB("Cross join, while getting next from the child 1"); bool eos = false; while (true) { @@ -67,10 +71,8 @@ Status VCrossJoinNode::construct_build_side(RuntimeState* state) { _build_rows += rows; _total_mem_usage += mem_usage; _build_blocks.emplace_back(std::move(block)); - _mem_tracker->Consume(mem_usage); + _block_mem_tracker->consume(mem_usage); } - // to prevent use too many memory - RETURN_IF_LIMIT_EXCEEDED(state, "Cross join, while getting next from the child 1."); if (eos) { break; @@ -89,6 +91,7 @@ void VCrossJoinNode::init_get_next(int left_batch_row) { Status VCrossJoinNode::get_next(RuntimeState* state, Block* block, bool* eos) { RETURN_IF_CANCELLED(state); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); *eos = false; SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/vec/exec/vcross_join_node.h b/be/src/vec/exec/vcross_join_node.h index aeeeb3a2db79bb..1c8998a9f200d4 100644 --- a/be/src/vec/exec/vcross_join_node.h +++ b/be/src/vec/exec/vcross_join_node.h @@ -64,6 +64,8 @@ class VCrossJoinNode final : public VBlockingJoinNode { uint64_t _build_rows = 0; uint64_t _total_mem_usage = 0; + std::shared_ptr _block_mem_tracker; + // Build mutable columns to insert data. 
// if block can mem reuse, just clear data in block // else build a new block and alloc mem of column from left and right child block diff --git a/be/src/vec/exec/ves_http_scan_node.cpp b/be/src/vec/exec/ves_http_scan_node.cpp index a70acadb89bb2a..cbe2d2097a368e 100644 --- a/be/src/vec/exec/ves_http_scan_node.cpp +++ b/be/src/vec/exec/ves_http_scan_node.cpp @@ -106,7 +106,7 @@ Status VEsHttpScanNode::scanner_scan(std::unique_ptr scanner) { bool scanner_eof = false; const int batch_size = _runtime_state->batch_size(); - std::unique_ptr tuple_pool(new MemPool(mem_tracker().get())); + std::unique_ptr tuple_pool(new MemPool()); size_t slot_num = _tuple_desc->slots().size(); while (!scanner_eof) { diff --git a/be/src/vec/exec/volap_scan_node.cpp b/be/src/vec/exec/volap_scan_node.cpp index 77f02131468cfc..5897de710acd1a 100644 --- a/be/src/vec/exec/volap_scan_node.cpp +++ b/be/src/vec/exec/volap_scan_node.cpp @@ -21,6 +21,7 @@ #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/runtime_filter_mgr.h" +#include "runtime/thread_context.h" #include "util/priority_thread_pool.hpp" #include "vec/core/block.h" #include "vec/exec/volap_scanner.h" @@ -36,6 +37,8 @@ VOlapScanNode::VOlapScanNode(ObjectPool* pool, const TPlanNode& tnode, const Des void VOlapScanNode::transfer_thread(RuntimeState* state) { // scanner open pushdown to scanThread + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), mem_tracker()); Status status = Status::OK(); if (_vconjunct_ctx_ptr) { @@ -69,7 +72,7 @@ void VOlapScanNode::transfer_thread(RuntimeState* state) { auto block_per_scanner = (doris_scanner_row_num + (block_size - 1)) / block_size; auto pre_block_count = std::min(_volap_scanners.size(), static_cast(config::doris_scanner_thread_pool_thread_num)) * block_per_scanner; - + uint64_t buffered_bytes = 0; for (int i = 0; i < pre_block_count; ++i) { auto block = new Block; for (const auto slot_desc : 
_tuple_desc->slots()) { @@ -80,9 +83,9 @@ void VOlapScanNode::transfer_thread(RuntimeState* state) { slot_desc->col_name())); } _free_blocks.emplace_back(block); - _buffered_bytes += block->allocated_bytes(); + buffered_bytes += block->allocated_bytes(); } - _mem_tracker->Consume(_buffered_bytes); + _block_mem_tracker->consume(buffered_bytes); // read from scanner while (LIKELY(status.ok())) { @@ -139,6 +142,9 @@ void VOlapScanNode::transfer_thread(RuntimeState* state) { } void VOlapScanNode::scanner_thread(VOlapScanner* scanner) { + SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(), + print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id(), mem_tracker()); int64_t wait_time = scanner->update_wait_worker_timer(); // Do not use ScopedTimer. There is no guarantee that, the counter // (_scan_cpu_timer, the class member) is not destroyed after `_running_thread==0`. @@ -293,6 +299,7 @@ Status VOlapScanNode::start_scan_thread(RuntimeState* state) { _transfer_done = true; return Status::OK(); } + _block_mem_tracker = MemTracker::create_virtual_tracker(-1, "VOlapScanNode:Block"); // ranges constructed from scan keys std::vector> cond_ranges; @@ -337,7 +344,7 @@ Status VOlapScanNode::start_scan_thread(RuntimeState* state) { } VOlapScanner* scanner = new VOlapScanner(state, this, _olap_scan_node.is_preaggregation, - _need_agg_finalize, *scan_range); + _need_agg_finalize, *scan_range, scanner_mem_tracker); // add scanner to pool before doing prepare. // so that scanner can be automatically deconstructed if prepare failed. 
_scanner_pool.add(scanner); @@ -366,6 +373,7 @@ Status VOlapScanNode::close(RuntimeState* state) { return Status::OK(); } RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // change done status { @@ -386,7 +394,6 @@ Status VOlapScanNode::close(RuntimeState* state) { std::for_each(_materialized_blocks.begin(), _materialized_blocks.end(), std::default_delete()); std::for_each(_scan_blocks.begin(), _scan_blocks.end(), std::default_delete()); std::for_each(_free_blocks.begin(), _free_blocks.end(), std::default_delete()); - _mem_tracker->Release(_buffered_bytes); // OlapScanNode terminate by exception // so that initiative close the Scanner @@ -406,6 +413,7 @@ Status VOlapScanNode::close(RuntimeState* state) { } Status VOlapScanNode::get_next(RuntimeState* state, Block* block, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/vec/exec/volap_scan_node.h b/be/src/vec/exec/volap_scan_node.h index 921399ee6bd5b1..09f77364eda279 100644 --- a/be/src/vec/exec/volap_scan_node.h +++ b/be/src/vec/exec/volap_scan_node.h @@ -63,6 +63,8 @@ class VOlapScanNode final : public OlapScanNode { std::list _volap_scanners; std::mutex _volap_scanners_lock; + std::shared_ptr _block_mem_tracker; + int _max_materialized_blocks; }; } // namespace vectorized diff --git a/be/src/vec/exec/volap_scanner.cpp b/be/src/vec/exec/volap_scanner.cpp index 7b5b31eb52d770..b2ec142dcb5f24 100644 --- a/be/src/vec/exec/volap_scanner.cpp +++ b/be/src/vec/exec/volap_scanner.cpp @@ -30,17 +30,20 @@ #include "vec/exec/volap_scan_node.h" #include "vec/exprs/vexpr_context.h" #include "vec/runtime/vdatetime_value.h" +#include "runtime/thread_context.h" namespace doris::vectorized { VOlapScanner::VOlapScanner(RuntimeState* runtime_state, VOlapScanNode* parent, bool aggregation, - bool 
need_agg_finalize, const TPaloScanRange& scan_range) - : OlapScanner(runtime_state, parent, aggregation, need_agg_finalize, scan_range) { + bool need_agg_finalize, const TPaloScanRange& scan_range, + std::shared_ptr tracker) + : OlapScanner(runtime_state, parent, aggregation, need_agg_finalize, scan_range, tracker) { } Status VOlapScanner::get_block(RuntimeState* state, vectorized::Block* block, bool* eof) { // only empty block should be here DCHECK(block->rows() == 0); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t raw_rows_threshold = raw_rows_read() + config::doris_scanner_row_num; if (!block->mem_reuse()) { diff --git a/be/src/vec/exec/volap_scanner.h b/be/src/vec/exec/volap_scanner.h index 0c1c4adf854aee..b6ef7e32ff8250 100644 --- a/be/src/vec/exec/volap_scanner.h +++ b/be/src/vec/exec/volap_scanner.h @@ -33,7 +33,8 @@ class VOlapScanNode; class VOlapScanner : public OlapScanner { public: VOlapScanner(RuntimeState* runtime_state, VOlapScanNode* parent, bool aggregation, - bool need_agg_finalize, const TPaloScanRange& scan_range); + bool need_agg_finalize, const TPaloScanRange& scan_range, + std::shared_ptr tracker); Status get_block(RuntimeState* state, vectorized::Block* block, bool* eof); diff --git a/be/src/vec/exec/vset_operation_node.cpp b/be/src/vec/exec/vset_operation_node.cpp index 3e6f73dae7226d..0be24a3ae5f6a7 100644 --- a/be/src/vec/exec/vset_operation_node.cpp +++ b/be/src/vec/exec/vset_operation_node.cpp @@ -17,6 +17,7 @@ #include "vec/exec/vset_operation_node.h" +#include "runtime/thread_context.h" #include "util/defer_op.h" #include "vec/exprs/vexpr.h" namespace doris { @@ -36,10 +37,10 @@ struct HashTableBuild { using KeyGetter = typename HashTableContext::State; using Mapped = typename HashTableContext::Mapped; int64_t old_bucket_bytes = hash_table_ctx.hash_table.get_buffer_size_in_bytes(); - + Defer defer {[&]() { int64_t bucket_bytes = hash_table_ctx.hash_table.get_buffer_size_in_bytes(); - 
_operation_node->_mem_tracker->Consume(bucket_bytes - old_bucket_bytes); + _operation_node->_hash_table_mem_tracker->consume(bucket_bytes - old_bucket_bytes); _operation_node->_mem_used += bucket_bytes - old_bucket_bytes; }}; @@ -80,10 +81,11 @@ Status VSetOperationNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); for (auto& exprs : _child_expr_lists) { VExpr::close(exprs, state); } - _mem_tracker->Release(_mem_used); + _hash_table_mem_tracker->release(_mem_used); return ExecNode::close(state); } @@ -111,6 +113,7 @@ Status VSetOperationNode::init(const TPlanNode& tnode, RuntimeState* state) { } Status VSetOperationNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); // open result expr lists. @@ -123,6 +126,8 @@ Status VSetOperationNode::open(RuntimeState* state) { Status VSetOperationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _hash_table_mem_tracker = MemTracker::create_virtual_tracker(-1, "VSetOperationNode:HashTable"); SCOPED_TIMER(_runtime_profile->total_time_counter()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _probe_timer = ADD_TIMER(runtime_profile(), "ProbeTime"); @@ -224,6 +229,8 @@ void VSetOperationNode::hash_table_init() { //build a hash table from child(0) Status VSetOperationNode::hash_table_build(RuntimeState* state) { RETURN_IF_ERROR(child(0)->open(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB( + "Set Operation Node, while constructing the hash table"); Block block; bool eos = false; while (!eos) { @@ -233,12 +240,9 @@ Status VSetOperationNode::hash_table_build(RuntimeState* state) { RETURN_IF_ERROR(child(0)->get_next(state, &block, &eos)); size_t allocated_bytes = block.allocated_bytes(); - 
_mem_tracker->Consume(allocated_bytes); + _hash_table_mem_tracker->consume(allocated_bytes); _mem_used += allocated_bytes; - - RETURN_IF_LIMIT_EXCEEDED(state, "Set Operation Node, while getting next from the child 0."); RETURN_IF_ERROR(process_build_block(block)); - RETURN_IF_LIMIT_EXCEEDED(state, "Set Operation Node, while constructing the hash table."); } return Status::OK(); } diff --git a/be/src/vec/exec/vset_operation_node.h b/be/src/vec/exec/vset_operation_node.h index 93a7f8b61a3825..ea2eb7c8ea0331 100644 --- a/be/src/vec/exec/vset_operation_node.h +++ b/be/src/vec/exec/vset_operation_node.h @@ -89,6 +89,8 @@ class VSetOperationNode : public ExecNode { RuntimeProfile::Counter* _build_timer; // time to build hash table RuntimeProfile::Counter* _probe_timer; // time to probe + std::shared_ptr _hash_table_mem_tracker; + template friend class HashTableBuild; template diff --git a/be/src/vec/exec/vsort_node.cpp b/be/src/vec/exec/vsort_node.cpp index 734af91baac45a..fd93977ea86118 100644 --- a/be/src/vec/exec/vsort_node.cpp +++ b/be/src/vec/exec/vsort_node.cpp @@ -20,6 +20,7 @@ #include "exec/sort_exec_exprs.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" #include "vec/core/sort_block.h" @@ -43,12 +44,15 @@ Status VSortNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); _runtime_profile->add_info_string("TOP-N", _limit == -1 ? 
"false" : "true"); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _block_mem_tracker = MemTracker::create_virtual_tracker(-1, "VSortNode:Block", mem_tracker()); RETURN_IF_ERROR(_vsort_exec_exprs.prepare(state, child(0)->row_desc(), _row_descriptor, expr_mem_tracker())); return Status::OK(); } Status VSortNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(_vsort_exec_exprs.open(state)); @@ -74,6 +78,7 @@ Status VSortNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) } Status VSortNode::get_next(RuntimeState* state, Block* block, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); auto status = Status::OK(); @@ -102,7 +107,8 @@ Status VSortNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - _mem_tracker->Release(_total_mem_usage); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _block_mem_tracker->release(_total_mem_usage); _vsort_exec_exprs.close(state); ExecNode::close(state); return Status::OK(); @@ -159,7 +165,7 @@ Status VSortNode::sort_input(RuntimeState* state) { _sorted_blocks.emplace_back(std::move(block)); } - _mem_tracker->Consume(mem_usage); + _block_mem_tracker->consume(mem_usage); RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(state->check_query_state("vsort, while sorting input.")); } diff --git a/be/src/vec/exec/vsort_node.h b/be/src/vec/exec/vsort_node.h index 66876aa149d4e4..f3e39ca1d34d59 100644 --- a/be/src/vec/exec/vsort_node.h +++ b/be/src/vec/exec/vsort_node.h @@ -85,6 +85,8 @@ class VSortNode : public doris::ExecNode { // only valid in TOP-N node uint64_t _num_rows_in_block = 0; std::priority_queue _block_priority_queue; + + std::shared_ptr _block_mem_tracker; }; } // end namespace doris diff --git 
a/be/src/vec/exprs/vexpr_context.cpp b/be/src/vec/exprs/vexpr_context.cpp index a8f1d5291f7023..113abac69cfaf3 100644 --- a/be/src/vec/exprs/vexpr_context.cpp +++ b/be/src/vec/exprs/vexpr_context.cpp @@ -19,6 +19,7 @@ #include "udf/udf_internal.h" #include "vec/exprs/vexpr.h" +#include "runtime/thread_context.h" namespace doris::vectorized { VExprContext::VExprContext(VExpr* expr) @@ -39,7 +40,9 @@ doris::Status VExprContext::prepare(doris::RuntimeState* state, const doris::RowDescriptor& row_desc, const std::shared_ptr& tracker) { _prepared = true; - _pool.reset(new MemPool(state->instance_mem_tracker().get())); + _mem_tracker = tracker; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + _pool.reset(new MemPool()); return _root->prepare(state, row_desc, this); } @@ -48,6 +51,7 @@ doris::Status VExprContext::open(doris::RuntimeState* state) { if (_opened) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); _opened = true; // Fragment-local state is only initialized for original contexts. Clones inherit the // original's fragment state and only need to have thread-local state initialized. @@ -58,6 +62,7 @@ doris::Status VExprContext::open(doris::RuntimeState* state) { void VExprContext::close(doris::RuntimeState* state) { DCHECK(!_closed); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); FunctionContext::FunctionStateScope scope = _is_clone ? 
FunctionContext::THREAD_LOCAL : FunctionContext::FRAGMENT_LOCAL; _root->close(state, this, scope); @@ -76,9 +81,10 @@ doris::Status VExprContext::clone(RuntimeState* state, VExprContext** new_ctx) { DCHECK(_prepared); DCHECK(_opened); DCHECK(*new_ctx == nullptr); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); *new_ctx = state->obj_pool()->add(new VExprContext(_root)); - (*new_ctx)->_pool.reset(new MemPool(_pool->mem_tracker())); + (*new_ctx)->_pool.reset(new MemPool()); for (auto& _fn_context : _fn_contexts) { (*new_ctx)->_fn_contexts.push_back(_fn_context->impl()->clone((*new_ctx)->_pool.get())); } @@ -86,6 +92,7 @@ doris::Status VExprContext::clone(RuntimeState* state, VExprContext** new_ctx) { (*new_ctx)->_is_clone = true; (*new_ctx)->_prepared = true; (*new_ctx)->_opened = true; + (*new_ctx)->_mem_tracker = _mem_tracker; return _root->open(state, *new_ctx, FunctionContext::THREAD_LOCAL); } diff --git a/be/src/vec/exprs/vexpr_context.h b/be/src/vec/exprs/vexpr_context.h index 0021779b35b7da..2df377d770a5be 100644 --- a/be/src/vec/exprs/vexpr_context.h +++ b/be/src/vec/exprs/vexpr_context.h @@ -83,6 +83,8 @@ class VExprContext { /// and owned by this VExprContext. std::vector _fn_contexts; + std::shared_ptr _mem_tracker; + /// Pool backing fn_contexts_. Counts against the runtime state's UDF mem tracker. 
std::unique_ptr _pool; diff --git a/be/src/vec/olap/vgeneric_iterators.cpp b/be/src/vec/olap/vgeneric_iterators.cpp index f0f148da2d77e0..e3d853ac3a35de 100644 --- a/be/src/vec/olap/vgeneric_iterators.cpp +++ b/be/src/vec/olap/vgeneric_iterators.cpp @@ -259,10 +259,7 @@ Status VMergeIteratorContext::_load_next_block() { class VMergeIterator : public RowwiseIterator { public: // VMergeIterator takes the ownership of input iterators - VMergeIterator(std::vector& iters, std::shared_ptr parent) : _origin_iters(iters) { - // use for count the mem use of Block use in Merge - _mem_tracker = MemTracker::CreateTracker(-1, "VMergeIterator", parent, false); - } + VMergeIterator(std::vector& iters) : _origin_iters(iters) {} ~VMergeIterator() override { while (!_merge_heap.empty()) { @@ -350,10 +347,7 @@ class VUnionIterator : public RowwiseIterator { // Iterators' ownership it transfered to this class. // This class will delete all iterators when destructs // Client should not use iterators any more. 
- VUnionIterator(std::vector& v, std::shared_ptr parent) - : _origin_iters(v.begin(), v.end()) { - _mem_tracker = MemTracker::CreateTracker(-1, "VUnionIterator", parent, false); - } + VUnionIterator(std::vector& v) : _origin_iters(v.begin(), v.end()) {} ~VUnionIterator() override { std::for_each(_origin_iters.begin(), _origin_iters.end(), std::default_delete()); @@ -403,18 +397,18 @@ Status VUnionIterator::next_batch(vectorized::Block* block) { } -RowwiseIterator* new_merge_iterator(std::vector& inputs, std::shared_ptr parent) { +RowwiseIterator* new_merge_iterator(std::vector& inputs) { if (inputs.size() == 1) { return *(inputs.begin()); } - return new VMergeIterator(inputs, parent); + return new VMergeIterator(inputs); } -RowwiseIterator* new_union_iterator(std::vector& inputs, std::shared_ptr parent) { +RowwiseIterator* new_union_iterator(std::vector& inputs) { if (inputs.size() == 1) { return *(inputs.begin()); } - return new VUnionIterator(inputs, parent); + return new VUnionIterator(inputs); } RowwiseIterator* new_auto_increment_iterator(const Schema& schema, size_t num_rows) { diff --git a/be/src/vec/olap/vgeneric_iterators.h b/be/src/vec/olap/vgeneric_iterators.h index 8177a63f8b00e2..eb2dade1be0e6d 100644 --- a/be/src/vec/olap/vgeneric_iterators.h +++ b/be/src/vec/olap/vgeneric_iterators.h @@ -27,14 +27,14 @@ namespace vectorized { // // Inputs iterators' ownership is taken by created merge iterator. And client // should delete returned iterator after usage. -RowwiseIterator* new_merge_iterator(std::vector& inputs, std::shared_ptr parent); +RowwiseIterator* new_merge_iterator(std::vector& inputs); // Create a union iterator for input iterators. Union iterator will read // input iterators one by one. // // Inputs iterators' ownership is taken by created union iterator. And client // should delete returned iterator after usage. 
-RowwiseIterator* new_union_iterator(std::vector& inputs, std::shared_ptr parent); +RowwiseIterator* new_union_iterator(std::vector& inputs); // Create an auto increment iterator which returns num_rows data in format of schema. // This class aims to be used in unit test. diff --git a/be/src/vec/runtime/vdata_stream_mgr.cpp b/be/src/vec/runtime/vdata_stream_mgr.cpp index 4b0bb5f75c334a..b26d11cd6bd7b4 100644 --- a/be/src/vec/runtime/vdata_stream_mgr.cpp +++ b/be/src/vec/runtime/vdata_stream_mgr.cpp @@ -53,7 +53,7 @@ std::shared_ptr VDataStreamMgr::create_recvr( VLOG_FILE << "creating receiver for fragment=" << fragment_instance_id << ", node=" << dest_node_id; std::shared_ptr recvr(new VDataStreamRecvr( - this, state->instance_mem_tracker(), row_desc, fragment_instance_id, dest_node_id, + this, row_desc, fragment_instance_id, dest_node_id, num_senders, is_merging, buffer_size, profile, sub_plan_query_statistics_recvr)); uint32_t hash_value = get_hash_value(fragment_instance_id, dest_node_id); std::lock_guard l(_lock); diff --git a/be/src/vec/runtime/vdata_stream_recvr.cpp b/be/src/vec/runtime/vdata_stream_recvr.cpp index 59e18d07a13e38..9f52fbe333f103 100644 --- a/be/src/vec/runtime/vdata_stream_recvr.cpp +++ b/be/src/vec/runtime/vdata_stream_recvr.cpp @@ -19,6 +19,7 @@ #include "gen_cpp/data.pb.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "util/uid_util.h" #include "vec/core/block.h" #include "vec/core/sort_cursor.h" @@ -121,7 +122,7 @@ void VDataStreamRecvr::SenderQueue::add_block(const PBlock& pblock, int be_numbe SCOPED_TIMER(_recvr->_deserialize_row_batch_timer); block = new Block(pblock); } - _recvr->_mem_tracker->Consume(block->bytes()); + _recvr->_block_mem_tracker->consume(block->bytes()); VLOG_ROW << "added #rows=" << block->rows() << " batch_size=" << block_byte_size << "\n"; _block_queue.emplace_back(block_byte_size, block); @@ -158,7 +159,7 @@ void VDataStreamRecvr::SenderQueue::add_block(Block* block, bool use_move) 
{ size_t block_size = nblock->bytes(); _block_queue.emplace_back(block_size, nblock); - _recvr->_mem_tracker->Consume(nblock->bytes()); + _recvr->_block_mem_tracker->consume(nblock->bytes()); _data_arrival_cv.notify_one(); if (_recvr->exceeds_limit(block_size)) { @@ -241,10 +242,9 @@ void VDataStreamRecvr::SenderQueue::close() { } VDataStreamRecvr::VDataStreamRecvr( - VDataStreamMgr* stream_mgr, const std::shared_ptr& parent_tracker, - const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id, - PlanNodeId dest_node_id, int num_senders, bool is_merging, int total_buffer_limit, - RuntimeProfile* profile, + VDataStreamMgr* stream_mgr, const RowDescriptor& row_desc, + const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, int num_senders, + bool is_merging, int total_buffer_limit, RuntimeProfile* profile, std::shared_ptr sub_plan_query_statistics_recvr) : _mgr(stream_mgr), _fragment_instance_id(fragment_instance_id), @@ -256,8 +256,13 @@ VDataStreamRecvr::VDataStreamRecvr( _num_buffered_bytes(0), _profile(profile), _sub_plan_query_statistics_recvr(sub_plan_query_statistics_recvr) { - _mem_tracker = MemTracker::CreateTracker( - _profile, -1, "VDataStreamRecvr:" + print_id(_fragment_instance_id), parent_tracker); + _mem_tracker = + MemTracker::create_tracker(-1, "VDataStreamRecvr:" + print_id(_fragment_instance_id), + nullptr, MemTrackerLevel::VERBOSE, _profile); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + _block_mem_tracker = MemTracker::create_virtual_tracker( + -1, "VDataStreamRecvr:block:" + print_id(_fragment_instance_id), nullptr, + MemTrackerLevel::VERBOSE); // Create one queue per sender if is_merging is true. int num_queues = is_merging ? 
num_senders : 1; @@ -287,6 +292,7 @@ Status VDataStreamRecvr::create_merger(const std::vector& orderin const std::vector& nulls_first, size_t batch_size, int64_t limit, size_t offset) { DCHECK(_is_merging); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::vector child_block_suppliers; // Create the merger that will a single stream of sorted rows. _merger.reset(new VSortedRunMerger(ordering_expr, is_asc_order, nulls_first, batch_size, limit, @@ -302,16 +308,19 @@ Status VDataStreamRecvr::create_merger(const std::vector& orderin void VDataStreamRecvr::add_block(const PBlock& pblock, int sender_id, int be_number, int64_t packet_seq, ::google::protobuf::Closure** done) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? sender_id : 0; _sender_queues[use_sender_id]->add_block(pblock, be_number, packet_seq, done); } void VDataStreamRecvr::add_block(Block* block, int sender_id, bool use_move) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? 
sender_id : 0; _sender_queues[use_sender_id]->add_block(block, use_move); } Status VDataStreamRecvr::get_next(Block* block, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (!_is_merging) { Block* res = nullptr; RETURN_IF_ERROR(_sender_queues[0]->get_batch(&res)); @@ -325,15 +334,16 @@ Status VDataStreamRecvr::get_next(Block* block, bool* eos) { RETURN_IF_ERROR(_merger->get_next(block, eos)); } - if (LIKELY(_mem_tracker->consumption() >= block->bytes())) { - _mem_tracker->Release(block->bytes()); + if (LIKELY(_block_mem_tracker->consumption() >= block->bytes())) { + _block_mem_tracker->release(block->bytes()); } else { - _mem_tracker->Release(_mem_tracker->consumption()); + _block_mem_tracker->release(_block_mem_tracker->consumption()); } return Status::OK(); } void VDataStreamRecvr::remove_sender(int sender_id, int be_number) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? sender_id : 0; _sender_queues[use_sender_id]->decrement_senders(be_number); } @@ -349,6 +359,7 @@ void VDataStreamRecvr::close() { return; } _is_closed = true; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); for (int i = 0; i < _sender_queues.size(); ++i) { _sender_queues[i]->close(); } @@ -358,7 +369,7 @@ void VDataStreamRecvr::close() { _mgr = nullptr; _merger.reset(); - _mem_tracker->Release(_mem_tracker->consumption()); + _block_mem_tracker->release(_block_mem_tracker->consumption()); } } // namespace doris::vectorized diff --git a/be/src/vec/runtime/vdata_stream_recvr.h b/be/src/vec/runtime/vdata_stream_recvr.h index 1292b978b52dcb..0b8c188a1bf547 100644 --- a/be/src/vec/runtime/vdata_stream_recvr.h +++ b/be/src/vec/runtime/vdata_stream_recvr.h @@ -50,10 +50,10 @@ class VExprContext; class VDataStreamRecvr { public: - VDataStreamRecvr(VDataStreamMgr* stream_mgr, const std::shared_ptr& parent_tracker, - const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id, - PlanNodeId dest_node_id, int 
num_senders, bool is_merging, - int total_buffer_limit, RuntimeProfile* profile, + VDataStreamRecvr(VDataStreamMgr* stream_mgr, const RowDescriptor& row_desc, + const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, + int num_senders, bool is_merging, int total_buffer_limit, + RuntimeProfile* profile, std::shared_ptr sub_plan_query_statistics_recvr); ~VDataStreamRecvr(); @@ -73,7 +73,6 @@ class VDataStreamRecvr { const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } PlanNodeId dest_node_id() const { return _dest_node_id; } const RowDescriptor& row_desc() const { return _row_desc; } - std::shared_ptr mem_tracker() const { return _mem_tracker; } void add_sub_plan_statistics(const PQueryStatistics& statistics, int sender_id) { _sub_plan_query_statistics_recvr->insert(statistics, sender_id); @@ -117,6 +116,7 @@ class VDataStreamRecvr { std::atomic _num_buffered_bytes; std::shared_ptr _mem_tracker; + std::shared_ptr _block_mem_tracker; std::vector _sender_queues; std::unique_ptr _merger; diff --git a/be/src/vec/sink/vdata_stream_sender.cpp b/be/src/vec/sink/vdata_stream_sender.cpp index 8819eaa2362baa..a8031301c7fa8a 100644 --- a/be/src/vec/sink/vdata_stream_sender.cpp +++ b/be/src/vec/sink/vdata_stream_sender.cpp @@ -28,6 +28,7 @@ #include "runtime/mem_tracker.h" #include "runtime/runtime_state.h" #include "util/proto_util.h" +#include "runtime/thread_context.h" #include "vec/common/sip_hash.h" #include "vec/runtime/vdata_stream_mgr.h" #include "vec/runtime/vdata_stream_recvr.h" @@ -343,9 +344,10 @@ Status VDataStreamSender::prepare(RuntimeState* state) { _dest_node_id, instances); _profile = _pool->add(new RuntimeProfile(std::move(title))); SCOPED_TIMER(_profile->total_time_counter()); - _mem_tracker = MemTracker::CreateTracker( - _profile, -1, "VDataStreamSender:" + print_id(state->fragment_instance_id()), - state->instance_mem_tracker()); + _mem_tracker = MemTracker::create_tracker( + -1, "VDataStreamSender:" + 
print_id(state->fragment_instance_id()), nullptr, + MemTrackerLevel::VERBOSE, _profile); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_part_type == TPartitionType::UNPARTITIONED || _part_type == TPartitionType::RANDOM) { std::random_device rd; @@ -378,6 +380,7 @@ Status VDataStreamSender::prepare(RuntimeState* state) { } Status VDataStreamSender::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(state != nullptr); RETURN_IF_ERROR(VExpr::open(_partition_expr_ctxs, state)); for (auto iter : _partition_infos) { @@ -391,6 +394,7 @@ Status VDataStreamSender::send(RuntimeState* state, RowBatch* batch) { } Status VDataStreamSender::send(RuntimeState* state, Block* block) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); if (_part_type == TPartitionType::UNPARTITIONED || _channels.size() == 1) { // 1. serialize depends on it is not local exchange @@ -501,6 +505,7 @@ Status VDataStreamSender::send(RuntimeState* state, Block* block) { Status VDataStreamSender::close(RuntimeState* state, Status exec_status) { if (_closed) return Status::OK(); _closed = true; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); Status final_st = Status::OK(); for (int i = 0; i < _channels.size(); ++i) { diff --git a/be/test/runtime/buffered_tuple_stream2_test.cpp b/be/test/runtime/buffered_tuple_stream2_test.cpp index e31d935fc1504e..4a522ad84fda4e 100644 --- a/be/test/runtime/buffered_tuple_stream2_test.cpp +++ b/be/test/runtime/buffered_tuple_stream2_test.cpp @@ -117,7 +117,7 @@ class SimpleTupleStreamTest : public testing::Test { } virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) { - RowBatch* batch = _pool.add(new RowBatch(*_int_desc, num_rows, _tracker.get())); + RowBatch* batch = _pool.add(new RowBatch(*_int_desc, num_rows)); int tuple_size = _int_desc->tuple_descriptors()[0]->byte_size(); uint8_t* tuple_mem = reinterpret_cast( 
batch->tuple_data_pool()->allocate(tuple_size * num_rows)); @@ -146,7 +146,7 @@ class SimpleTupleStreamTest : public testing::Test { virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) { int tuple_size = sizeof(StringValue) + 1; - RowBatch* batch = _pool.add(new RowBatch(*_string_desc, num_rows, _tracker.get())); + RowBatch* batch = _pool.add(new RowBatch(*_string_desc, num_rows)); uint8_t* tuple_mem = batch->tuple_data_pool()->allocate(tuple_size * num_rows); memset(tuple_mem, 0, tuple_size * num_rows); const int string_tuples = _string_desc->tuple_descriptors().size(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index c49134f64ffdab..740c440536b5ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -44,6 +44,7 @@ public class SessionVariable implements Serializable, Writable { static final Logger LOG = LogManager.getLogger(SessionVariable.class); public static final String EXEC_MEM_LIMIT = "exec_mem_limit"; + public static final String QUERY_MEM_LIMIT = "query_mem_limit"; public static final String QUERY_TIMEOUT = "query_timeout"; public static final String ENABLE_PROFILE = "enable_profile"; public static final String SQL_MODE = "sql_mode"; @@ -184,10 +185,14 @@ public class SessionVariable implements Serializable, Writable { @VariableMgr.VarAttr(name = INSERT_VISIBLE_TIMEOUT_MS, needForward = true) public long insertVisibleTimeoutMs = DEFAULT_INSERT_VISIBLE_TIMEOUT_MS; - // max memory used on every backend. + // max instance memory used on every backend. @VariableMgr.VarAttr(name = EXEC_MEM_LIMIT) public long maxExecMemByte = 2147483648L; + // max query memory used on every backend. 
+ @VariableMgr.VarAttr(name = QUERY_MEM_LIMIT) + public long maxQueryMemByte = 0L; + @VariableMgr.VarAttr(name = ENABLE_SPILLING) public boolean enableSpilling = false; @@ -427,6 +432,10 @@ public long getMaxExecMemByte() { return maxExecMemByte; } + public long getMaxQueryMemByte() { + return maxQueryMemByte; + } + public long getLoadMemLimit() { return loadMemLimit; } @@ -553,6 +562,14 @@ public void setMaxExecMemByte(long maxExecMemByte) { } } + public void setMaxQueryMemByte(long maxExecMemByte) { + if (maxExecMemByte < MIN_EXEC_MEM_LIMIT) { + this.maxExecMemByte = MIN_EXEC_MEM_LIMIT; + } else { + this.maxExecMemByte = maxExecMemByte; + } + } + public boolean isSqlQuoteShowCreate() { return sqlQuoteShowCreate; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java index 1031a55c344db8..77d734304add7e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java @@ -18,7 +18,7 @@ package org.apache.doris.qe; import org.apache.doris.catalog.Catalog; -import org.apache.doris.common.FeConstants; +// import org.apache.doris.common.FeConstants; import org.apache.doris.common.Pair; import org.apache.doris.common.Reference; import org.apache.doris.common.UserException; @@ -192,7 +192,7 @@ public static void addToBlacklist(Long backendID, String reason) { return; } - blacklistBackends.put(backendID, Pair.create(FeConstants.heartbeat_interval_second + 1, reason)); + // blacklistBackends.put(backendID, Pair.create(FeConstants.heartbeat_interval_second + 1, reason)); LOG.warn("add backend {} to black list. 
reason: {}", backendID, reason); } From 20d7a982d7fafc1e5a34bb5db2b479835c96c39b Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Wed, 2 Mar 2022 15:18:59 +0800 Subject: [PATCH 14/14] performance and stability --- be/src/common/config.h | 2 +- be/src/exec/aggregation_node.cpp | 8 +- be/src/exec/analytic_eval_node.cpp | 8 +- be/src/exec/assert_num_rows_node.cpp | 4 +- be/src/exec/blocking_join_node.cpp | 4 +- be/src/exec/broker_scan_node.cpp | 8 +- be/src/exec/cross_join_node.cpp | 2 +- be/src/exec/csv_scan_node.cpp | 8 +- be/src/exec/es_http_scan_node.cpp | 8 +- be/src/exec/es_scan_node.cpp | 8 +- be/src/exec/except_node.cpp | 2 +- be/src/exec/exchange_node.cpp | 8 +- be/src/exec/hash_join_node.cpp | 8 +- be/src/exec/hash_table.cpp | 2 +- be/src/exec/intersect_node.cpp | 2 +- be/src/exec/merge_join_node.cpp | 8 +- be/src/exec/merge_node.cpp | 8 +- be/src/exec/mysql_scan_node.cpp | 8 +- be/src/exec/odbc_scan_node.cpp | 8 +- be/src/exec/olap_scan_node.cpp | 8 +- be/src/exec/olap_scanner.cpp | 2 +- be/src/http/default_path_handlers.cpp | 2 +- be/src/olap/base_compaction.cpp | 3 - be/src/olap/compaction.cpp | 3 +- be/src/olap/cumulative_compaction.cpp | 3 - be/src/olap/lru_cache.cpp | 6 +- .../olap/rowset/segment_v2/column_reader.cpp | 4 +- be/src/olap/rowset/segment_v2/segment.cpp | 4 +- be/src/runtime/CMakeLists.txt | 2 + be/src/runtime/bufferpool/buffer_allocator.cc | 4 +- .../runtime/bufferpool/reservation_tracker.cc | 2 +- be/src/runtime/data_stream_recvr.cc | 1 - be/src/runtime/disk_io_mgr.cc | 20 +- be/src/runtime/disk_io_mgr.h | 2 +- be/src/runtime/exec_env.h | 14 +- be/src/runtime/exec_env_init.cpp | 18 +- be/src/runtime/fragment_mgr.cpp | 5 + be/src/runtime/initial_reservations.cc | 5 +- be/src/runtime/load_channel_mgr.cpp | 2 +- be/src/runtime/mem_pool.cpp | 35 +-- be/src/runtime/mem_tracker.cpp | 75 +++-- be/src/runtime/mem_tracker.h | 104 ++++++- be/src/runtime/mem_tracker_task_pool.cpp | 2 + be/src/runtime/memory/chunk.h | 1 - 
be/src/runtime/memory/chunk_allocator.cpp | 30 +- be/src/runtime/memory/chunk_allocator.h | 8 +- be/src/runtime/plan_fragment_executor.cpp | 10 +- be/src/runtime/row_batch.cpp | 136 +-------- be/src/runtime/runtime_filter_mgr.cpp | 2 +- be/src/runtime/tablets_channel.cpp | 2 +- be/src/runtime/tcmalloc_hook.h | 21 +- be/src/runtime/thread_context.cpp | 32 +++ be/src/runtime/thread_context.h | 157 +++++++---- be/src/runtime/thread_mem_tracker_mgr.cpp | 104 ++----- be/src/runtime/thread_mem_tracker_mgr.h | 262 ++++++++++++++++-- be/src/runtime/threadlocal.cc | 84 ++++++ be/src/runtime/threadlocal.h | 122 ++++++++ be/src/service/doris_main.cpp | 66 +++++ be/src/service/http_service.cpp | 2 +- be/test/runtime/test_env.cc | 1 - build.sh | 2 +- env.sh | 2 +- 62 files changed, 968 insertions(+), 516 deletions(-) create mode 100644 be/src/runtime/thread_context.cpp create mode 100644 be/src/runtime/threadlocal.cc create mode 100644 be/src/runtime/threadlocal.h diff --git a/be/src/common/config.h b/be/src/common/config.h index 78fa344e2ad4af..c188d9a4aec64b 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -611,7 +611,7 @@ CONF_mInt16(mem_tracker_level, "0"); // smaller than this value will continue to accumulate. specified as number of bytes. // Decreasing this value will increase the frequency of consume/release. // Increasing this value will cause MemTracker statistics to be inaccurate. 
-CONF_mInt32(mem_tracker_consume_min_size_bytes, "1048576"); +CONF_mInt32(mem_tracker_consume_min_size_bytes, "2097152"); // When MemTracker is a negative value, it is considered that a memory leak has occurred, // but the actual MemTracker records inaccurately will also cause a negative value, diff --git a/be/src/exec/aggregation_node.cpp b/be/src/exec/aggregation_node.cpp index 0d3ed04ca50565..51b88830a2230a 100644 --- a/be/src/exec/aggregation_node.cpp +++ b/be/src/exec/aggregation_node.cpp @@ -78,7 +78,7 @@ Status AggregationNode::init(const TPlanNode& tnode, RuntimeState* state) { Status AggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _get_results_timer = ADD_TIMER(runtime_profile(), "GetResultsTime"); _hash_table_buckets_counter = ADD_COUNTER(runtime_profile(), "BuildBuckets", TUnit::UNIT); @@ -143,7 +143,7 @@ Status AggregationNode::prepare(RuntimeState* state) { } Status AggregationNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); @@ -230,7 +230,7 @@ Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* // 3. 
`child(0)->rows_returned() == 0` mean not data from child // in level two aggregation node should return nullptr result // level one aggregation node set `eos = true` return directly - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (UNLIKELY(!_needs_finalize && _singleton_output_tuple != nullptr && child(0)->rows_returned() == 0)) { *eos = true; @@ -292,7 +292,7 @@ Status AggregationNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // Iterate through the remaining rows in the hash table and call Serialize/Finalize on // them in order to free any memory allocated by UDAs. Finalize() requires a dst tuple diff --git a/be/src/exec/analytic_eval_node.cpp b/be/src/exec/analytic_eval_node.cpp index 3d12cb48fd6ec3..ff9cc412673a20 100644 --- a/be/src/exec/analytic_eval_node.cpp +++ b/be/src/exec/analytic_eval_node.cpp @@ -142,7 +142,7 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); DCHECK(child(0)->row_desc().is_prefix_of(row_desc())); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _child_tuple_desc = child(0)->row_desc().tuple_descriptors()[0]; _curr_tuple_pool.reset(new MemPool()); _prev_tuple_pool.reset(new MemPool()); @@ -185,7 +185,7 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { } Status AnalyticEvalNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_CANCELLED(state); @@ -814,7 +814,7 @@ inline int64_t 
AnalyticEvalNode::num_output_rows_ready() const { } Status AnalyticEvalNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); @@ -859,7 +859,7 @@ Status AnalyticEvalNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_input_stream.get() != nullptr) { _input_stream->close(); diff --git a/be/src/exec/assert_num_rows_node.cpp b/be/src/exec/assert_num_rows_node.cpp index d25a0c071da1fc..4f9de8ad77a2d6 100644 --- a/be/src/exec/assert_num_rows_node.cpp +++ b/be/src/exec/assert_num_rows_node.cpp @@ -49,7 +49,7 @@ Status AssertNumRowsNode::prepare(RuntimeState* state) { } Status AssertNumRowsNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); // ISSUE-3435 @@ -58,7 +58,7 @@ Status AssertNumRowsNode::open(RuntimeState* state) { } Status AssertNumRowsNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); SCOPED_TIMER(_runtime_profile->total_time_counter()); output_batch->reset(); diff --git a/be/src/exec/blocking_join_node.cpp b/be/src/exec/blocking_join_node.cpp index 0f5d4f626dd284..e54721dd861c64 100644 --- a/be/src/exec/blocking_join_node.cpp +++ b/be/src/exec/blocking_join_node.cpp @@ -46,7 +46,7 @@ BlockingJoinNode::~BlockingJoinNode() 
{ Status BlockingJoinNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); @@ -90,7 +90,7 @@ void BlockingJoinNode::build_side_thread(RuntimeState* state, std::promisetotal_time_counter()); // RETURN_IF_ERROR(Expr::open(_conjuncts, state)); diff --git a/be/src/exec/broker_scan_node.cpp b/be/src/exec/broker_scan_node.cpp index bfdce6ca3f4b96..adc959e23632f5 100644 --- a/be/src/exec/broker_scan_node.cpp +++ b/be/src/exec/broker_scan_node.cpp @@ -61,7 +61,7 @@ Status BrokerScanNode::init(const TPlanNode& tnode, RuntimeState* state) { Status BrokerScanNode::prepare(RuntimeState* state) { VLOG_QUERY << "BrokerScanNode prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get tuple desc _runtime_state = state; _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -88,7 +88,7 @@ Status BrokerScanNode::prepare(RuntimeState* state) { } Status BrokerScanNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); @@ -109,7 +109,7 @@ Status BrokerScanNode::start_scanners() { } Status BrokerScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // check if CANCELLED. 
if (state->is_cancelled()) { @@ -194,7 +194,7 @@ Status BrokerScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); _scan_finished.store(true); diff --git a/be/src/exec/cross_join_node.cpp b/be/src/exec/cross_join_node.cpp index a4f00e0e4d8b78..4fbbc871a01574 100644 --- a/be/src/exec/cross_join_node.cpp +++ b/be/src/exec/cross_join_node.cpp @@ -84,7 +84,7 @@ void CrossJoinNode::init_get_next(TupleRow* first_left_row) { Status CrossJoinNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { // RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT, state)); RETURN_IF_CANCELLED(state); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); *eos = false; // TOOD(zhaochun) // RETURN_IF_ERROR(state->check_query_state()); diff --git a/be/src/exec/csv_scan_node.cpp b/be/src/exec/csv_scan_node.cpp index d9ac8ea0483a4e..afebe91e5bc282 100644 --- a/be/src/exec/csv_scan_node.cpp +++ b/be/src/exec/csv_scan_node.cpp @@ -129,7 +129,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // add timer _split_check_timer = ADD_TIMER(_runtime_profile, "split check timer"); @@ -212,7 +212,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { } Status CsvScanNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); VLOG_CRITICAL << "CsvScanNode::Open"; @@ -235,7 +235,7 @@ Status CsvScanNode::open(RuntimeState* state) { } 
Status CsvScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "CsvScanNode::GetNext"; if (nullptr == state || nullptr == row_batch || nullptr == eos) { return Status::InternalError("input is nullptr pointer"); @@ -324,7 +324,7 @@ Status CsvScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "CsvScanNode::Close"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); diff --git a/be/src/exec/es_http_scan_node.cpp b/be/src/exec/es_http_scan_node.cpp index eae73138cd68cc..42a122e8988bde 100644 --- a/be/src/exec/es_http_scan_node.cpp +++ b/be/src/exec/es_http_scan_node.cpp @@ -68,7 +68,7 @@ Status EsHttpScanNode::init(const TPlanNode& tnode, RuntimeState* state) { Status EsHttpScanNode::prepare(RuntimeState* state) { VLOG_QUERY << "EsHttpScanNode prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _runtime_state = state; _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -115,7 +115,7 @@ Status EsHttpScanNode::build_conjuncts_list() { } Status EsHttpScanNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); @@ -184,7 +184,7 @@ Status EsHttpScanNode::collect_scanners_status() { } Status EsHttpScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (state->is_cancelled()) { std::unique_lock l(_batch_queue_lock); @@ -272,7 +272,7 @@ Status EsHttpScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); _scan_finished.store(true); diff --git a/be/src/exec/es_scan_node.cpp b/be/src/exec/es_scan_node.cpp index a548a0b1cacf16..a50e1d905eb45f 100644 --- a/be/src/exec/es_scan_node.cpp +++ b/be/src/exec/es_scan_node.cpp @@ -68,7 +68,7 @@ Status EsScanNode::prepare(RuntimeState* state) { VLOG_CRITICAL << "EsScanNode::Prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); if (_tuple_desc == nullptr) { std::stringstream ss; @@ -82,7 +82,7 @@ Status EsScanNode::prepare(RuntimeState* state) { } Status EsScanNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "EsScanNode::Open"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); @@ -207,7 +207,7 @@ Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // create tuple @@ -260,7 +260,7 @@ Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) Status 
EsScanNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "EsScanNode::Close"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/except_node.cpp b/be/src/exec/except_node.cpp index 992ab60a068e6a..f3fc6695f078f7 100644 --- a/be/src/exec/except_node.cpp +++ b/be/src/exec/except_node.cpp @@ -90,7 +90,7 @@ Status ExceptNode::open(RuntimeState* state) { Status ExceptNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); *eos = true; if (reached_limit()) { diff --git a/be/src/exec/exchange_node.cpp b/be/src/exec/exchange_node.cpp index e421415b83ce56..759246c55bf7bc 100644 --- a/be/src/exec/exchange_node.cpp +++ b/be/src/exec/exchange_node.cpp @@ -58,7 +58,7 @@ Status ExchangeNode::init(const TPlanNode& tnode, RuntimeState* state) { Status ExchangeNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _convert_row_batch_timer = ADD_TIMER(runtime_profile(), "ConvertRowBatchTime"); // TODO: figure out appropriate buffer size DCHECK_GT(_num_senders, 0); @@ -76,7 +76,7 @@ Status ExchangeNode::prepare(RuntimeState* state) { } Status ExchangeNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); if 
(_is_merging) { @@ -105,7 +105,7 @@ Status ExchangeNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_is_merging) { _sort_exec_exprs.close(state); } @@ -132,7 +132,7 @@ Status ExchangeNode::fill_input_row_batch(RuntimeState* state) { Status ExchangeNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit()) { diff --git a/be/src/exec/hash_join_node.cpp b/be/src/exec/hash_join_node.cpp index 41bde1e45e0756..16c42166e5db62 100644 --- a/be/src/exec/hash_join_node.cpp +++ b/be/src/exec/hash_join_node.cpp @@ -96,7 +96,7 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status HashJoinNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); @@ -157,7 +157,7 @@ Status HashJoinNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); // Must reset _probe_batch in close() to release resources @@ -219,7 +219,7 @@ Status HashJoinNode::construct_hash_table(RuntimeState* state) { } Status HashJoinNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); 
RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -777,11 +777,9 @@ Status HashJoinNode::process_build_batch(RuntimeState* state, RowBatch* build_ba _build_pool.get(), false); } } - RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table."); } else { // take ownership of tuple data of build_batch _build_pool->acquire_data(build_batch->tuple_data_pool(), false); - RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table."); for (int i = 0; i < build_batch->num_rows(); ++i) { _hash_tbl->insert(build_batch->get_row(i)); diff --git a/be/src/exec/hash_table.cpp b/be/src/exec/hash_table.cpp index b821a4a7cce9e0..022bbc301af20b 100644 --- a/be/src/exec/hash_table.cpp +++ b/be/src/exec/hash_table.cpp @@ -48,7 +48,7 @@ HashTable::HashTable(const std::vector& build_expr_ctxs, DCHECK_EQ(_build_expr_ctxs.size(), _probe_expr_ctxs.size()); DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2"; - _mem_tracker = MemTracker::create_virtual_tracker(-1, mem_tracker->label() + ":HashTable", + _mem_tracker = MemTracker::create_virtual_tracker(-1, mem_tracker->label() + "HashTable", mem_tracker); _buckets.resize(num_buckets); _num_buckets = num_buckets; diff --git a/be/src/exec/intersect_node.cpp b/be/src/exec/intersect_node.cpp index b943b28f85fae7..98660740a4d3f2 100644 --- a/be/src/exec/intersect_node.cpp +++ b/be/src/exec/intersect_node.cpp @@ -88,7 +88,7 @@ Status IntersectNode::open(RuntimeState* state) { } Status IntersectNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/merge_join_node.cpp 
b/be/src/exec/merge_join_node.cpp index 72ae19ac414822..bf6e78ca16677f 100644 --- a/be/src/exec/merge_join_node.cpp +++ b/be/src/exec/merge_join_node.cpp @@ -72,7 +72,7 @@ Status MergeJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status MergeJoinNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // build and probe exprs are evaluated in the context of the rows produced by our // right and left children, respectively @@ -141,7 +141,7 @@ Status MergeJoinNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); Expr::close(_left_expr_ctxs, state); Expr::close(_right_expr_ctxs, state); @@ -150,7 +150,7 @@ Status MergeJoinNode::close(RuntimeState* state) { } Status MergeJoinNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(ExecNode::open(state)); @@ -172,7 +172,7 @@ Status MergeJoinNode::open(RuntimeState* state) { } Status MergeJoinNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/merge_node.cpp b/be/src/exec/merge_node.cpp index b8232e00132eb3..b2a12b33862604 100644 --- a/be/src/exec/merge_node.cpp +++ b/be/src/exec/merge_node.cpp @@ -61,7 +61,7 @@ Status MergeNode::init(const 
TPlanNode& tnode, RuntimeState* state) { Status MergeNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); DCHECK(_tuple_desc != nullptr); @@ -92,7 +92,7 @@ Status MergeNode::prepare(RuntimeState* state) { } Status MergeNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); // Prepare const expr lists. for (int i = 0; i < _const_result_expr_ctx_lists.size(); ++i) { @@ -108,7 +108,7 @@ Status MergeNode::open(RuntimeState* state) { } Status MergeNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -189,7 +189,7 @@ Status MergeNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // don't call ExecNode::close(), it always closes all children _child_row_batch.reset(nullptr); for (int i = 0; i < _const_result_expr_ctx_lists.size(); ++i) { diff --git a/be/src/exec/mysql_scan_node.cpp b/be/src/exec/mysql_scan_node.cpp index 0ea0e5df56c50b..349fd3750da901 100644 --- a/be/src/exec/mysql_scan_node.cpp +++ b/be/src/exec/mysql_scan_node.cpp @@ -54,7 +54,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get 
tuple desc _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -101,7 +101,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { } Status MysqlScanNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); VLOG_CRITICAL << "MysqlScanNode::Open"; @@ -149,7 +149,7 @@ Status MysqlScanNode::write_text_slot(char* value, int value_length, SlotDescrip } Status MysqlScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "MysqlScanNode::GetNext"; if (nullptr == state || nullptr == row_batch || nullptr == eos) { @@ -245,7 +245,7 @@ Status MysqlScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/odbc_scan_node.cpp b/be/src/exec/odbc_scan_node.cpp index 379e8aee8451a7..054dc825f213b6 100644 --- a/be/src/exec/odbc_scan_node.cpp +++ b/be/src/exec/odbc_scan_node.cpp @@ -56,7 +56,7 @@ Status OdbcScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get tuple desc _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -94,7 +94,7 @@ Status OdbcScanNode::prepare(RuntimeState* state) { } Status OdbcScanNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); 
VLOG_CRITICAL << _scan_node_type << "::Open"; @@ -131,7 +131,7 @@ Status OdbcScanNode::write_text_slot(char* value, int value_length, SlotDescript Status OdbcScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { VLOG_CRITICAL << _scan_node_type << "::GetNext"; - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (nullptr == state || nullptr == row_batch || nullptr == eos) { return Status::InternalError("input is nullptr pointer"); @@ -236,7 +236,7 @@ Status OdbcScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index dceb6171d6ba77..fa29b048bb346a 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -171,7 +171,7 @@ void OlapScanNode::_init_counter(RuntimeState* state) { Status OlapScanNode::prepare(RuntimeState* state) { init_scan_profile(); RETURN_IF_ERROR(ScanNode::prepare(state)); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // create scanner profile // create timer _tablet_counter = ADD_COUNTER(runtime_profile(), "TabletCount ", TUnit::UNIT); @@ -216,7 +216,7 @@ Status OlapScanNode::prepare(RuntimeState* state) { } Status OlapScanNode::open(RuntimeState* state) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "OlapScanNode::Open"; SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); @@ -260,7 +260,7 @@ Status OlapScanNode::open(RuntimeState* state) { } Status 
OlapScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -375,7 +375,7 @@ Status OlapScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); // change done status diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index 682886ed17f84e..37cd62e97f4aa5 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -259,7 +259,7 @@ Status OlapScanner::_init_return_columns() { } Status OlapScanner::get_batch(RuntimeState* state, RowBatch* batch, bool* eof) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // 2. Allocate Row's Tuple buf uint8_t* tuple_buf = batch->tuple_data_pool()->allocate(state->batch_size() * _tuple_desc->byte_size()); diff --git a/be/src/http/default_path_handlers.cpp b/be/src/http/default_path_handlers.cpp index 3bb77d3057a5f1..2b7803344b6a9d 100644 --- a/be/src/http/default_path_handlers.cpp +++ b/be/src/http/default_path_handlers.cpp @@ -144,7 +144,7 @@ void mem_tracker_handler(const WebPageHandler::ArgumentMap& args, std::stringstr (*output) << "\n"; std::vector> trackers; - MemTracker::list_root_trackers(&trackers); + MemTracker::list_process_trackers(&trackers); for (const shared_ptr& tracker : trackers) { string parent = tracker->parent() == nullptr ? 
"none" : tracker->parent()->label(); string limit_str; diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp index 7472a807279e28..7647527d1b1357 100644 --- a/be/src/olap/base_compaction.cpp +++ b/be/src/olap/base_compaction.cpp @@ -29,7 +29,6 @@ BaseCompaction::BaseCompaction(TabletSharedPtr tablet) BaseCompaction::~BaseCompaction() {} OLAPStatus BaseCompaction::prepare_compact() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (!_tablet->init_succeeded()) { return OLAP_ERR_INPUT_PARAMETER_ERROR; } @@ -51,7 +50,6 @@ OLAPStatus BaseCompaction::prepare_compact() { } OLAPStatus BaseCompaction::execute_compact_impl() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); MutexLock lock(_tablet->get_base_lock(), TRY_LOCK); if (!lock.own_lock()) { LOG(WARNING) << "another base compaction is running. tablet=" << _tablet->full_name(); @@ -83,7 +81,6 @@ OLAPStatus BaseCompaction::execute_compact_impl() { } OLAPStatus BaseCompaction::pick_rowsets_to_compact() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); _input_rowsets.clear(); _tablet->pick_candidate_rowsets_to_base_compaction(&_input_rowsets); if (_input_rowsets.size() <= 1) { diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 8aff5b6dded6d5..7b3239eed85e3f 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -29,7 +29,7 @@ namespace doris { Compaction::Compaction(TabletSharedPtr tablet, const std::string& label) : _mem_tracker( - MemTracker::create_tracker(-1, label, nullptr, MemTrackerLevel::TASK)), + MemTracker::create_tracker(-1, label, nullptr, MemTrackerLevel::INSTANCE)), _readers_tracker(MemTracker::create_tracker( -1, "CompactionReaderTracker:" + std::to_string(tablet->tablet_id()), _mem_tracker)), @@ -44,6 +44,7 @@ Compaction::Compaction(TabletSharedPtr tablet, const std::string& label) Compaction::~Compaction() {} OLAPStatus Compaction::compact() { + 
SCOPED_ATTACH_TASK_THREAD_2ARG(ThreadContext::TaskType::COMPACTION, _mem_tracker); RETURN_NOT_OK(prepare_compact()); RETURN_NOT_OK(execute_compact()); return OLAP_SUCCESS; diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index 7990d75c95f0ff..bce0148a834246 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -30,7 +30,6 @@ CumulativeCompaction::CumulativeCompaction(TabletSharedPtr tablet) CumulativeCompaction::~CumulativeCompaction() {} OLAPStatus CumulativeCompaction::prepare_compact() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (!_tablet->init_succeeded()) { return OLAP_ERR_CUMULATIVE_INVALID_PARAMETERS; } @@ -58,7 +57,6 @@ OLAPStatus CumulativeCompaction::prepare_compact() { } OLAPStatus CumulativeCompaction::execute_compact_impl() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); MutexLock lock(_tablet->get_cumulative_lock(), TRY_LOCK); if (!lock.own_lock()) { LOG(INFO) << "The tablet is under cumulative compaction. 
tablet=" << _tablet->full_name(); @@ -96,7 +94,6 @@ OLAPStatus CumulativeCompaction::execute_compact_impl() { } OLAPStatus CumulativeCompaction::pick_rowsets_to_compact() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::vector candidate_rowsets; _tablet->pick_candidate_rowsets_to_cumulative_compaction( diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index 92fdf1dcd65f26..78e856a18ce435 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -320,8 +320,8 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // note that the cache might get larger than its capacity if not enough // space was freed auto old = _table.insert(e); - DCHECK(thread_local_ctx.thread_mem_tracker()->parent_task_mem_tracker() == nullptr); - Status st = source_mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), charge); + DCHECK(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->parent_task_mem_tracker() != nullptr); + source_mem_tracker->transfer_to(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(), charge); _usage += e->total_size; if (old != nullptr) { old->in_cache = false; @@ -476,7 +476,7 @@ ShardedLRUCache::~ShardedLRUCache() { Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value), CachePriority priority) { - std::shared_ptr source_mem_tracker = thread_local_ctx.thread_mem_tracker(); + std::shared_ptr source_mem_tracker = thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(); SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const uint32_t hash = _hash_slice(key); return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority, diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 30848d497eeb15..85d9b77e72e579 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ 
b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -457,9 +457,7 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, bool FileColumnIterator::FileColumnIterator(ColumnReader* reader) : _reader(reader) {} -FileColumnIterator::~FileColumnIterator() { - _opts.mem_tracker->release(_opts.mem_tracker->consumption()); -} +FileColumnIterator::~FileColumnIterator() {} Status FileColumnIterator::seek_to_first() { RETURN_IF_ERROR(_reader->seek_to_first(&_page_iter)); diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index d0d6836c02a8e7..377c561e26d2b7 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -50,8 +50,8 @@ Status Segment::open(const FilePathDesc& path_desc, uint32_t segment_id, const T Segment::Segment(const FilePathDesc& path_desc, uint32_t segment_id, const TabletSchema* tablet_schema) : _path_desc(path_desc), _segment_id(segment_id), _tablet_schema(tablet_schema) { - _mem_tracker = MemTracker::create_tracker(-1, "Segment", - StorageEngine::instance()->tablet_mem_tracker()); + _mem_tracker = MemTracker::create_virtual_tracker( + -1, "Segment", StorageEngine::instance()->tablet_mem_tracker()); } Segment::~Segment() { diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt index c38c2a116e24c6..a1ca44b15e51d2 100644 --- a/be/src/runtime/CMakeLists.txt +++ b/be/src/runtime/CMakeLists.txt @@ -46,8 +46,10 @@ set(RUNTIME_FILES runtime_state.cpp runtime_filter_mgr.cpp string_value.cpp + thread_context.cpp thread_mem_tracker_mgr.cpp thread_resource_mgr.cpp + threadlocal.cc decimalv2_value.cpp large_int_value.cpp collection_value.cpp diff --git a/be/src/runtime/bufferpool/buffer_allocator.cc b/be/src/runtime/bufferpool/buffer_allocator.cc index fc639b46c21a71..58bd873f74f5e0 100644 --- a/be/src/runtime/bufferpool/buffer_allocator.cc +++ b/be/src/runtime/bufferpool/buffer_allocator.cc @@ -221,7 +221,7 @@ Status 
BufferPool::BufferAllocator::Allocate(ClientHandle* client, int64_t len, COUNTER_UPDATE(client->impl_->counters().cumulative_allocations, 1); RETURN_IF_ERROR(AllocateInternal(len, buffer)); - thread_local_ctx.consume_mem(len); + // thread_local_ctx.get()->consume_mem(len); DCHECK(buffer->is_open()); buffer->client_ = client; return Status::OK(); @@ -385,7 +385,7 @@ void BufferPool::BufferAllocator::Free(BufferHandle&& handle) { handle.client_ = nullptr; // Buffer is no longer associated with a client. FreeBufferArena* arena = per_core_arenas_[handle.home_core_].get(); handle.Poison(); - thread_local_ctx.release_mem(handle.len()); + // thread_local_ctx.get()->release_mem(handle.len()); arena->AddFreeBuffer(std::move(handle)); } diff --git a/be/src/runtime/bufferpool/reservation_tracker.cc b/be/src/runtime/bufferpool/reservation_tracker.cc index cb1b5f929f268c..1e56441a0d8fd4 100644 --- a/be/src/runtime/bufferpool/reservation_tracker.cc +++ b/be/src/runtime/bufferpool/reservation_tracker.cc @@ -60,7 +60,7 @@ void ReservationTracker::InitChildTracker(RuntimeProfile* profile, ReservationTr std::lock_guard l(lock_); DCHECK(!initialized_); parent_ = parent; - mem_tracker_ = mem_tracker; + mem_tracker_ = nullptr; // TODO(zxy) remove ReservationTracker later reservation_limit_ = reservation_limit; reservation_ = 0; diff --git a/be/src/runtime/data_stream_recvr.cc b/be/src/runtime/data_stream_recvr.cc index f11b178fda30e0..f80d12918541bb 100644 --- a/be/src/runtime/data_stream_recvr.cc +++ b/be/src/runtime/data_stream_recvr.cc @@ -492,7 +492,6 @@ void DataStreamRecvr::add_batch(RowBatch* batch, int sender_id, bool use_move) { } void DataStreamRecvr::remove_sender(int sender_id, int be_number) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? 
sender_id : 0; _sender_queues[use_sender_id]->decrement_senders(be_number); } diff --git a/be/src/runtime/disk_io_mgr.cc b/be/src/runtime/disk_io_mgr.cc index 585d88d62660da..f575d17df7da07 100644 --- a/be/src/runtime/disk_io_mgr.cc +++ b/be/src/runtime/disk_io_mgr.cc @@ -234,7 +234,7 @@ void DiskIoMgr::BufferDescriptor::set_mem_tracker(std::shared_ptr tr if (_mem_tracker.get() == tracker.get()) { return; } - Status st = _mem_tracker->transfer_to(tracker, _buffer_len); + _mem_tracker->transfer_to(tracker, _buffer_len); _mem_tracker = std::move(tracker); } @@ -274,6 +274,9 @@ DiskIoMgr::DiskIoMgr() // std::min((uint64_t)config::max_cached_file_handles, FileSystemUtil::max_num_file_handles()), // &HdfsCachedFileHandle::release) { { + _mem_tracker = + MemTracker::create_tracker(-1, "DiskIO", nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t max_buffer_size_scaled = bit_ceil(_max_buffer_size, _min_buffer_size); _free_buffers.resize(bit_log2(max_buffer_size_scaled) + 1); int num_local_disks = (config::num_disks == 0 ? DiskInfo::num_disks() : config::num_disks); @@ -294,6 +297,9 @@ DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_disk, int min_buffer_s // _file_handle_cache(::min(config::max_cached_file_handles, // FileSystemUtil::max_num_file_handles()), &HdfsCachedFileHandle::release) { { + _mem_tracker = + MemTracker::create_tracker(-1, "DiskIO", nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t max_buffer_size_scaled = bit_ceil(_max_buffer_size, _min_buffer_size); _free_buffers.resize(bit_log2(max_buffer_size_scaled) + 1); if (num_local_disks == 0) { @@ -304,6 +310,7 @@ DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_disk, int min_buffer_s } DiskIoMgr::~DiskIoMgr() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); _shut_down = true; // Notify all worker threads and shut them down. 
for (int i = 0; i < _disk_queues.size(); ++i) { @@ -359,14 +366,13 @@ DiskIoMgr::~DiskIoMgr() { } Status DiskIoMgr::init(const int64_t mem_limit) { - _mem_tracker = - MemTracker::create_tracker(mem_limit, "DiskIO", nullptr, MemTrackerLevel::OVERVIEW); SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + _mem_tracker->set_limit(mem_limit); _cached_buffers_mem_tracker = MemTracker::create_tracker( mem_limit, "DiskIO:CachedBuffers", _mem_tracker, MemTrackerLevel::OVERVIEW); // If we hit the process limit, see if we can reclaim some memory by removing // previously allocated (but unused) io buffers. - ExecEnv::GetInstance()->process_mem_tracker()->add_gc_function( + MemTracker::get_process_tracker()->add_gc_function( std::bind(&DiskIoMgr::gc_io_buffers, this, std::placeholders::_1)); for (int i = 0; i < _disk_queues.size(); ++i) { @@ -388,7 +394,7 @@ Status DiskIoMgr::init(const int64_t mem_limit) { // _disk_thread_group.AddThread(new Thread("disk-io-mgr", ss.str(), // &DiskIoMgr::work_loop, this, _disk_queues[i])); _disk_thread_group.add_thread( - new std::thread(std::bind(&DiskIoMgr::work_loop, this, _disk_queues[i]))); + new std::thread(std::bind(&DiskIoMgr::work_loop, this, _disk_queues[i], _mem_tracker))); } } _request_context_cache.reset(new RequestContextCache(this)); @@ -652,6 +658,7 @@ Status DiskIoMgr::read(RequestContext* reader, ScanRange* range, BufferDescripto } void DiskIoMgr::return_buffer(BufferDescriptor* buffer_desc) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(buffer_desc != nullptr); if (!buffer_desc->_status.ok()) { DCHECK(buffer_desc->_buffer == nullptr); @@ -984,7 +991,7 @@ void DiskIoMgr::handle_read_finished(DiskQueue* disk_queue, RequestContext* read state.decrement_request_thread(); } -void DiskIoMgr::work_loop(DiskQueue* disk_queue) { +void DiskIoMgr::work_loop(DiskQueue* disk_queue, std::shared_ptr mem_tracker) { // The thread waits until there is work or the entire system is being shut down. 
// If there is work, performs the read or write requested and re-enqueues the // requesting context. @@ -996,6 +1003,7 @@ void DiskIoMgr::work_loop(DiskQueue* disk_queue) { // re-enqueues the request. // 3. Perform the read or write as specified. // Cancellation checking needs to happen in both steps 1 and 3. + SCOPED_ATTACH_TASK_THREAD_2ARG(ThreadContext::TaskType::QUERY, mem_tracker); while (!_shut_down) { RequestContext* worker_context = nullptr; ; diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h index fc386896f1a1c1..9b716abdae6e7d 100644 --- a/be/src/runtime/disk_io_mgr.h +++ b/be/src/runtime/disk_io_mgr.h @@ -808,7 +808,7 @@ class DiskIoMgr { // Disk worker thread loop. This function retrieves the next range to process on // the disk queue and invokes read_range() or Write() depending on the type of Range(). // There can be multiple threads per disk running this loop. - void work_loop(DiskQueue* queue); + void work_loop(DiskQueue* queue, std::shared_ptr mem_tracker); // This is called from the disk thread to get the next range to process. It will // wait until a scan range and buffer are available, or a write range is available. 
diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index 8fa0bf781d375d..a2d51625fdb645 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -60,6 +60,9 @@ class WebPageHandler; class StreamLoadExecutor; class RoutineLoadTaskExecutor; class SmallFileMgr; +class ThreadContext; + +static std::vector free_thread_ctx; class BackendServiceClient; class FrontendServiceClient; @@ -118,12 +121,16 @@ class ExecEnv { return nullptr; } - std::shared_ptr process_mem_tracker() { return _process_mem_tracker; } + // std::shared_ptr process_mem_tracker() { return _process_mem_tracker; } + // MemTracker* process_mem_tracker_raw() { return _process_mem_tracker.get(); } + std::shared_ptr new_process_mem_tracker() { return _new_process_mem_tracker; } std::shared_ptr query_pool_mem_tracker() { return _query_pool_mem_tracker; } std::shared_ptr load_pool_mem_tracker() { return _load_pool_mem_tracker; } MemTrackerTaskPool* task_pool_mem_tracker_registry() { return _task_pool_mem_tracker_registry.get(); } + std::vector free_thread_ctx() { return _free_thread_ctx; } + // ThreadContext* get_thread_local_ctx() { return thread_local_ctx(); } ThreadResourceMgr* thread_mgr() { return _thread_mgr; } PriorityThreadPool* scan_thread_pool() { return _scan_thread_pool; } ThreadPool* limited_scan_thread_pool() { return _limited_scan_thread_pool.get(); } @@ -188,13 +195,16 @@ class ExecEnv { // The ancestor of all trackers in the process. It is the only child of the root tracker. // All manually created trackers should specify the process tracker as the parent. - std::shared_ptr _process_mem_tracker = nullptr; + // std::shared_ptr _process_mem_tracker = nullptr; + std::shared_ptr _new_process_mem_tracker = nullptr; // The ancestor for all querys tracker. std::shared_ptr _query_pool_mem_tracker = nullptr; // The ancestor for all load tracker. 
std::shared_ptr _load_pool_mem_tracker = nullptr; std::unique_ptr _task_pool_mem_tracker_registry; + std::vector _free_thread_ctx; + // The following two thread pools are used in different scenarios. // _scan_thread_pool is a priority thread pool. // Scanner threads for common queries will use this thread pool, diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index c7248803c22d67..7f75c6b2d111bc 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -149,7 +149,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _small_file_mgr->init(); _init_mem_tracker(); - RETURN_IF_ERROR(_load_channel_mgr->init(_process_mem_tracker->limit())); + RETURN_IF_ERROR(_load_channel_mgr->init(MemTracker::get_process_tracker()->limit())); _heartbeat_flags = new HeartbeatFlags(); _register_metrics(); _is_init = true; @@ -176,16 +176,20 @@ Status ExecEnv::_init_mem_tracker() { << ". Using physical memory instead"; global_memory_limit_bytes = MemInfo::physical_mem(); } - _process_mem_tracker = - MemTracker::create_tracker(global_memory_limit_bytes, "Process", - MemTracker::get_root_tracker(), MemTrackerLevel::OVERVIEW); + MemTracker::get_process_tracker()->set_limit(global_memory_limit_bytes); + // _process_mem_tracker = + // MemTracker::create_tracker(global_memory_limit_bytes, "Process", + // MemTracker::get_root_tracker(), MemTrackerLevel::OVERVIEW); + _new_process_mem_tracker = + MemTracker::create_virtual_tracker(global_memory_limit_bytes, "NewProcess", + nullptr, MemTrackerLevel::OVERVIEW); _query_pool_mem_tracker = - MemTracker::create_tracker(global_memory_limit_bytes, "QueryPool", _process_mem_tracker, + MemTracker::create_tracker(global_memory_limit_bytes, "QueryPool", MemTracker::get_process_tracker(), MemTrackerLevel::OVERVIEW); REGISTER_HOOK_METRIC(query_mem_consumption, [this]() { return _query_pool_mem_tracker->consumption(); }); _load_pool_mem_tracker = MemTracker::create_tracker( - 
global_memory_limit_bytes, "LoadPool", _process_mem_tracker, MemTrackerLevel::OVERVIEW); + global_memory_limit_bytes, "LoadPool", MemTracker::get_process_tracker(), MemTrackerLevel::OVERVIEW); REGISTER_HOOK_METRIC(load_mem_consumption, [this]() { return _load_pool_mem_tracker->consumption(); }); LOG(INFO) << "Using global memory limit: " @@ -252,7 +256,7 @@ Status ExecEnv::_init_mem_tracker() { SegmentLoader::create_global_instance(config::segment_cache_capacity); // 4. init other managers - RETURN_IF_ERROR(_disk_io_mgr->init(_process_mem_tracker->limit())); + RETURN_IF_ERROR(_disk_io_mgr->init(global_memory_limit_bytes)); RETURN_IF_ERROR(_tmp_file_mgr->init()); // TODO(zc): The current memory usage configuration is a bit confusing, diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 660e61ae944b89..74c819abc9ea35 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -251,6 +251,11 @@ Status FragmentExecState::execute() { Status FragmentExecState::cancel_before_execute() { // set status as 'abort', cuz cancel() won't effect the status arg of DataSink::close(). 
+ // TODO(zxy) 2ARG + SCOPED_ATTACH_TASK_THREAD_4ARG(executor()->runtime_state()->query_type(), + print_id(query_id()), + fragment_instance_id(), + executor()->runtime_state()->instance_mem_tracker()); _executor.set_abort(); _executor.cancel(); if (_pipe != nullptr) { diff --git a/be/src/runtime/initial_reservations.cc b/be/src/runtime/initial_reservations.cc index 21d0e66aaecc8a..86b1f2f1b65e6c 100644 --- a/be/src/runtime/initial_reservations.cc +++ b/be/src/runtime/initial_reservations.cc @@ -40,7 +40,8 @@ InitialReservations::InitialReservations(ObjectPool* obj_pool, : initial_reservation_mem_tracker_( MemTracker::create_tracker(-1, "InitialReservations", query_mem_tracker)), remaining_initial_reservation_claims_(initial_reservation_total_claims) { - initial_reservations_.InitChildTracker(nullptr, query_reservation, nullptr, + initial_reservations_.InitChildTracker(nullptr, query_reservation, + initial_reservation_mem_tracker_.get(), numeric_limits::max()); } @@ -82,7 +83,5 @@ void InitialReservations::Return(BufferPool::ClientHandle* src, int64_t bytes) { void InitialReservations::ReleaseResources() { initial_reservations_.Close(); - // TODO(HW): Close() is private. 
make this tracker shared later - // initial_reservation_mem_tracker_->Close(); } } // namespace doris diff --git a/be/src/runtime/load_channel_mgr.cpp b/be/src/runtime/load_channel_mgr.cpp index a2b109fefe5cf7..08eabfbdd7e89c 100644 --- a/be/src/runtime/load_channel_mgr.cpp +++ b/be/src/runtime/load_channel_mgr.cpp @@ -86,7 +86,7 @@ LoadChannelMgr::~LoadChannelMgr() { Status LoadChannelMgr::init(int64_t process_mem_limit) { int64_t load_mem_limit = calc_process_max_load_memory(process_mem_limit); _mem_tracker = MemTracker::create_tracker(load_mem_limit, "LoadChannelMgr", - ExecEnv::GetInstance()->process_mem_tracker(), + MemTracker::get_process_tracker(), MemTrackerLevel::OVERVIEW); SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); REGISTER_HOOK_METRIC(load_channel_mem_consumption, [this]() { return _mem_tracker->consumption(); }); diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp index 06396851837d92..9bf2d74b36af54 100644 --- a/be/src/runtime/mem_pool.cpp +++ b/be/src/runtime/mem_pool.cpp @@ -61,18 +61,17 @@ MemPool::MemPool() total_allocated_bytes_(0), total_reserved_bytes_(0), peak_allocated_bytes_(0), - _mem_tracker(thread_local_ctx.thread_mem_tracker()) {} + _mem_tracker(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()) {} MemPool::ChunkInfo::ChunkInfo(const Chunk& chunk_) : chunk(chunk_), allocated_bytes(0) { DorisMetrics::instance()->memory_pool_bytes_total->increment(chunk.size); } MemPool::~MemPool() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; - ChunkAllocator::instance()->free(chunk.chunk); + ChunkAllocator::instance()->free(chunk.chunk, _mem_tracker); } DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } @@ -88,11 +87,10 @@ void MemPool::clear() { } void MemPool::free_all() { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t 
total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; - ChunkAllocator::instance()->free(chunk.chunk); + ChunkAllocator::instance()->free(chunk.chunk, _mem_tracker); } chunks_.clear(); next_chunk_size_ = INITIAL_CHUNK_SIZE; @@ -104,7 +102,6 @@ void MemPool::free_all() { } Status MemPool::find_chunk(size_t min_size, bool check_limits) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // Try to allocate from a free chunk. We may have free chunks after the current chunk // if Clear() was called. The current chunk may be free if ReturnPartialAllocation() // was called. The first free chunk (if there is one) can therefore be either the @@ -145,7 +142,7 @@ Status MemPool::find_chunk(size_t min_size, bool check_limits) { // Allocate a new chunk. Return early if allocate fails. Chunk chunk; - RETURN_IF_ERROR(ChunkAllocator::instance()->allocate(chunk_size, &chunk, check_limits)); + RETURN_IF_ERROR(ChunkAllocator::instance()->allocate(chunk_size, &chunk, _mem_tracker, check_limits)); ASAN_POISON_MEMORY_REGION(chunk.data, chunk_size); // Put it before the first free chunk. If no free chunks, it goes at the end. if (first_free_idx == static_cast(chunks_.size())) { @@ -164,7 +161,6 @@ Status MemPool::find_chunk(size_t min_size, bool check_limits) { } void MemPool::acquire_data(MemPool* src, bool keep_current) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(src->check_integrity(false)); int num_acquired_chunks = 0; if (keep_current) { @@ -183,16 +179,17 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) { auto end_chunk = src->chunks_.begin() + num_acquired_chunks; int64_t total_transferred_bytes = 0; - // There is no limit check, assuming that both ends of acquire_data are in the same query. 
for (auto i = src->chunks_.begin(); i != end_chunk; ++i) { total_transferred_bytes += i->chunk.size; - Status st = i->chunk.mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), - i->chunk.size); - i->chunk.mem_tracker = thread_local_ctx.thread_mem_tracker(); } src->total_reserved_bytes_ -= total_transferred_bytes; total_reserved_bytes_ += total_transferred_bytes; + // Skip unnecessary atomic ops if the mem_trackers are the same. + if (src->_mem_tracker != _mem_tracker) { + src->_mem_tracker->transfer_to(_mem_tracker, total_transferred_bytes); + } + // insert new chunks after current_chunk_idx_ auto insert_chunk = chunks_.begin() + current_chunk_idx_ + 1; chunks_.insert(insert_chunk, src->chunks_.begin(), end_chunk); @@ -218,7 +215,8 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) { } void MemPool::exchange_data(MemPool* other) { - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + int64_t delta_size = other->total_reserved_bytes_ - total_reserved_bytes_; + other->_mem_tracker->transfer_to(_mem_tracker, delta_size); std::swap(current_chunk_idx_, other->current_chunk_idx_); std::swap(next_chunk_size_, other->next_chunk_size_); @@ -226,17 +224,6 @@ void MemPool::exchange_data(MemPool* other) { std::swap(total_reserved_bytes_, other->total_reserved_bytes_); std::swap(peak_allocated_bytes_, other->peak_allocated_bytes_); std::swap(chunks_, other->chunks_); - - // There is no limit check, assuming that both ends of acquire_data are in the same query. 
- for (auto i = chunks_.begin(); i != chunks_.end(); ++i) { - Status st = i->chunk.mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), - i->chunk.size); - i->chunk.mem_tracker = thread_local_ctx.thread_mem_tracker(); - } - for (auto i = other->chunks_.begin(); i != other->chunks_.end(); ++i) { - Status st = i->chunk.mem_tracker->transfer_to(other->_mem_tracker, i->chunk.size); - i->chunk.mem_tracker = other->_mem_tracker; - } } std::string MemPool::debug_string() { diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index 17397428c1daef..a6695e4b18b97c 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -35,26 +35,33 @@ namespace doris { const std::string MemTracker::COUNTER_NAME = "PeakMemoryUsage"; -// The ancestor for all trackers. Every tracker is visible from the root down. -// The consume/release of child tracker will not be synchronized to root tracker. +// The ancestor for all trackers. Every tracker is visible from the process down. +// The consume/release of child tracker will not be synchronized to process tracker. // It is used to independently statistics the real memory of the process in TCMalloc New/Delete Hook. 
-static std::shared_ptr root_tracker; -static GoogleOnceType root_tracker_once = GOOGLE_ONCE_INIT; +static std::shared_ptr process_tracker; +static MemTracker* raw_process_tracker; +static GoogleOnceType process_tracker_once = GOOGLE_ONCE_INIT; + +void MemTracker::create_process_tracker() { + process_tracker.reset(new MemTracker(-1, "Process", nullptr, MemTrackerLevel::OVERVIEW, nullptr)); + process_tracker->init(); + raw_process_tracker = process_tracker.get(); +} -void MemTracker::create_root_tracker() { - root_tracker.reset(new MemTracker(-1, "Root", nullptr, MemTrackerLevel::OVERVIEW, nullptr)); - root_tracker->init(); +std::shared_ptr MemTracker::get_process_tracker() { + GoogleOnceInit(&process_tracker_once, &MemTracker::create_process_tracker); + return process_tracker; } -std::shared_ptr MemTracker::get_root_tracker() { - GoogleOnceInit(&root_tracker_once, &MemTracker::create_root_tracker); - return root_tracker; +MemTracker* MemTracker::get_raw_process_tracker() { + GoogleOnceInit(&process_tracker_once, &MemTracker::create_process_tracker); + return raw_process_tracker; } -void MemTracker::list_root_trackers(std::vector>* trackers) { +void MemTracker::list_process_trackers(std::vector>* trackers) { trackers->clear(); std::deque> to_process; - to_process.push_front(get_root_tracker()); + to_process.push_front(get_process_tracker()); while (!to_process.empty()) { std::shared_ptr t = to_process.back(); to_process.pop_back(); @@ -79,13 +86,11 @@ std::shared_ptr MemTracker::create_tracker(int64_t byte_limit, const const std::shared_ptr& parent, MemTrackerLevel level, RuntimeProfile* profile) { - std::shared_ptr reset_parent = parent; - if (!reset_parent) { - reset_parent = thread_local_ctx.thread_mem_tracker(); - } + std::shared_ptr reset_parent = parent ? parent : thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(); + DCHECK(reset_parent); std::shared_ptr tracker(new MemTracker( - byte_limit, reset_parent->has_virtual_ancestor() == false ? 
label : "" + label, + byte_limit, label, reset_parent, level > reset_parent->_level ? level : reset_parent->_level, profile)); reset_parent->add_child_tracker(tracker); tracker->init(); @@ -95,9 +100,12 @@ std::shared_ptr MemTracker::create_tracker(int64_t byte_limit, const std::shared_ptr MemTracker::create_virtual_tracker( int64_t byte_limit, const std::string& label, const std::shared_ptr& parent, MemTrackerLevel level) { + std::shared_ptr reset_parent = parent ? parent : thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(); + DCHECK(reset_parent); + std::shared_ptr tracker(new MemTracker( - byte_limit, "" + label, - parent == nullptr ? thread_local_ctx.thread_mem_tracker() : parent, level, nullptr)); + byte_limit, "[Virtual]-" + label, reset_parent, level, nullptr)); + reset_parent->add_child_tracker(tracker); tracker->init_virtual(); return tracker; } @@ -109,7 +117,7 @@ MemTracker::MemTracker(int64_t byte_limit, const std::string& label) MemTracker::MemTracker(int64_t byte_limit, const std::string& label, const std::shared_ptr& parent, MemTrackerLevel level, RuntimeProfile* profile) - : _limit(byte_limit), _label(label), _parent(parent), _level(level) { + : _limit(byte_limit), _label(label), _id(_label + std::to_string(GetCurrentTimeMicros()) + std::to_string(rand())), _parent(parent), _level(level) { if (profile == nullptr) { _consumption = std::make_shared(TUnit::BYTES); } else { @@ -120,7 +128,7 @@ MemTracker::MemTracker(int64_t byte_limit, const std::string& label, void MemTracker::init() { DCHECK_GE(_limit, -1); MemTracker* tracker = this; - while (tracker != nullptr) { + while (tracker != nullptr && tracker->_virtual == false) { _all_trackers.push_back(tracker); if (tracker->has_limit()) _limit_trackers.push_back(tracker); tracker = tracker->_parent.get(); @@ -138,14 +146,14 @@ void MemTracker::init_virtual() { MemTracker::~MemTracker() { // TCMalloc hook will be triggered during destructor memtracker, may cause crash. 
- if (_label == "Root") GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER(); + if (_label == "Process") GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER(); if (!_virtual && parent()) { if (consumption() != 0) { memory_leak_check(this); // At present, it can only guarantee the accurate recording of the Instance tracker, // lower layer has the problem of repeated release of different trackers, as explained above. if (_level <= MemTrackerLevel::INSTANCE) { - _parent->release(consumption()); + // _parent->release(consumption()); } } @@ -157,9 +165,10 @@ MemTracker::~MemTracker() { _child_tracker_it = _parent->_child_trackers.end(); } } + consume(_untracked_mem); } -void MemTracker::transfer_to_relative(std::shared_ptr dst, int64_t bytes) { +void MemTracker::transfer_to_relative(const std::shared_ptr& dst, int64_t bytes) { DCHECK_EQ(_all_trackers.back(), dst->_all_trackers.back()) << "Must have same ancestor"; DCHECK(!dst->has_limit()); // Find the common ancestor and update trackers between 'this'/'dst' and @@ -179,22 +188,6 @@ void MemTracker::transfer_to_relative(std::shared_ptr dst, int64_t b dst->consume(bytes, common_ancestor); } -Status MemTracker::transfer_to(std::shared_ptr dst, int64_t bytes) { - // Must release first, then consume - release(bytes); - Status st = dst->try_consume(bytes); - if (!st) { - consume(bytes); - return st; - } - return Status::OK(); -} - -void MemTracker::transfer_to_force(std::shared_ptr dst, int64_t bytes) { - release(bytes); - dst->consume(bytes); -} - // Calling this on the query tracker results in output like: // // Query(4a4c81fedaed337d:4acadfda00000000) Limit=10.00 GB Total=508.28 MB Peak=508.45 MB @@ -267,7 +260,7 @@ std::string MemTracker::log_usage(int max_recursive_depth, Status MemTracker::mem_limit_exceeded(RuntimeState* state, const std::string& details, int64_t failed_allocation_size, Status failed_alloc) { - MemTracker* process_tracker = ExecEnv::GetInstance()->process_mem_tracker().get(); + MemTracker* process_tracker = 
MemTracker::get_raw_process_tracker(); std::string detail = "Memory exceed limit. fragment={}, details={}, on backend={}. Memory left in process " "limit={}."; diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index e2ce85c5a65bd1..22309518091192 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -89,10 +89,11 @@ class MemTracker { ~MemTracker(); // Returns a list of all the valid trackers. - static void list_root_trackers(std::vector>* trackers); + static void list_process_trackers(std::vector>* trackers); - // Gets a shared_ptr to the "root" tracker, creating it if necessary. - static std::shared_ptr get_root_tracker(); + // Gets a shared_ptr to the "process" tracker, creating it if necessary. + static std::shared_ptr get_process_tracker(); + static MemTracker* get_raw_process_tracker(); inline Status check_sys_mem_info(int64_t bytes) { if (MemInfo::initialized() && MemInfo::current_mem() + bytes >= MemInfo::mem_limit()) { @@ -158,6 +159,61 @@ class MemTracker { return Status::OK(); } + int64_t add_untracked_mem(int64_t bytes) { + _untracked_mem += bytes; + if (std::abs(_untracked_mem) >= config::mem_tracker_consume_min_size_bytes) { // || + // _untracked_mem <= -config::mem_tracker_consume_min_size_bytes) { + // std::lock_guard l(_untracked_mem_lock); + // int64_t consume_bytes = _untracked_mem; + // _untracked_mem -= consume_bytes; + // return consume_bytes; + + return _untracked_mem.exchange(0); + + // do { + // consume_bytes = _untracked_mem; + // } while (!_untracked_mem.compare_exchange_weak(consume_bytes, 0)); + // return consume_bytes; + } + return 0; + } + + void release_cache(int64_t bytes) { + int64_t consume_bytes = add_untracked_mem(bytes); + if (consume_bytes != 0) { + release(consume_bytes); + } + } + + void consume_cache(int64_t bytes) { + int64_t consume_bytes = add_untracked_mem(bytes); + if (consume_bytes != 0) { + consume(consume_bytes); + } + // _untracked_mem += bytes; + // if 
(std::abs(_untracked_mem) >= config::mem_tracker_consume_min_size_bytes) { + // consume(_untracked_mem.exchange(0)); + // } + // consume(_untracked_mem.exchange(0)); + } + + WARN_UNUSED_RESULT + Status try_consume_cache(int64_t bytes) { + if (bytes <= 0) { + release_cache(-bytes); + return Status::OK(); + } + int64_t consume_bytes = add_untracked_mem(bytes); + if (consume_bytes != 0) { + Status st = try_consume(consume_bytes); + if (!st) { + _untracked_mem += consume_bytes; + return st; + } + } + return Status::OK(); + } + // Decreases consumption of this tracker and its ancestors by 'bytes'. // up to (but not including) end_tracker. void release(int64_t bytes, MemTracker* end_tracker = nullptr) { @@ -190,11 +246,25 @@ class MemTracker { // updating all ancestors up to the first shared ancestor. Must not be used if // 'dst' has a limit, or an ancestor with a limit, that is not a common // ancestor with the tracker, because this does not check memory limits. - void transfer_to_relative(std::shared_ptr dst, int64_t bytes); + void transfer_to_relative(const std::shared_ptr& dst, int64_t bytes); + WARN_UNUSED_RESULT - Status transfer_to(std::shared_ptr dst, int64_t bytes); + Status try_transfer_to(const std::shared_ptr& dst, int64_t bytes) { + // Must release first, then consume + consume_cache(-bytes); + Status st = dst->try_consume_cache(bytes); + if (!st) { + consume_cache(bytes); + return st; + } + return Status::OK(); + } + // Forced transfer, 'dst' may limit exceed, and more ancestor trackers will be updated. - void transfer_to_force(std::shared_ptr dst, int64_t bytes); + void transfer_to(const std::shared_ptr& dst, int64_t bytes) { + consume_cache(-bytes); + dst->consume_cache(bytes); + } // Returns true if a valid limit of this tracker or one of its ancestors is exceeded. 
MemTracker* limit_exceeded_tracker() const { @@ -233,6 +303,7 @@ class MemTracker { bool limit_exceeded() const { return _limit >= 0 && _limit < consumption(); } int64_t limit() const { return _limit; } + void set_limit(int64_t limit) { _limit = limit; } bool has_limit() const { return _limit >= 0; } Status check_limit(int64_t bytes) { @@ -302,6 +373,10 @@ class MemTracker { return tracker == nullptr ? false : true; } + std::string id() { + return _id; + } + std::string debug_string() { std::stringstream msg; msg << "limit: " << _limit << "; " @@ -329,8 +404,8 @@ class MemTracker { inline Status try_gc_memory(int64_t bytes) { if (UNLIKELY(gc_memory(_limit - bytes))) { return Status::MemoryLimitExceeded( - fmt::format("label={} TryConsume failed size={}, used={}, limit={}", - label(), bytes, _consumption->current_value(), _limit)); + fmt::format("label={} TryConsume failed size={}, used={}, limit={}", label(), + bytes, _consumption->current_value(), _limit)); } VLOG_NOTICE << "GC succeeded, TryConsume bytes=" << bytes << " consumption=" << _consumption->current_value() << " limit=" << _limit; @@ -377,14 +452,16 @@ class MemTracker { } } - // Creates the root tracker. - static void create_root_tracker(); + // Creates the process tracker. + static void create_process_tracker(); // Limit on memory consumption, in bytes. If limit_ == -1, there is no consumption limit. - const int64_t _limit; + int64_t _limit; std::string _label; + std::string _id; + std::shared_ptr _parent; // The parent of this tracker. MemTrackerLevel _level; @@ -393,6 +470,11 @@ class MemTracker { std::shared_ptr _consumption; // in bytes + // Consume size smaller than mem_tracker_consume_min_size_bytes will continue to accumulate + // to avoid frequent calls to consume/release of MemTracker. 
+ std::atomic _untracked_mem = 0; + SpinLock _untracked_mem_lock; + std::vector _all_trackers; // this tracker plus all of its ancestors std::vector _limit_trackers; // _all_trackers with valid limits diff --git a/be/src/runtime/mem_tracker_task_pool.cpp b/be/src/runtime/mem_tracker_task_pool.cpp index 1208161d297b73..9c46eac8382f91 100644 --- a/be/src/runtime/mem_tracker_task_pool.cpp +++ b/be/src/runtime/mem_tracker_task_pool.cpp @@ -68,6 +68,8 @@ void MemTrackerTaskPool::logout_task_mem_tracker() { // No RuntimeState uses this task MemTracker, it is only referenced by this map, delete it if (it->second.use_count() == 1) { if (!config::memory_leak_detection || it->second->consumption() == 0) { + // + it->second->parent()->consume(-it->second->consumption(), MemTracker::get_process_tracker().get()); expired_tasks.emplace_back(it->first); } else { LOG(WARNING) << "Memory tracker " << it->second->debug_string() << " Memory leak " diff --git a/be/src/runtime/memory/chunk.h b/be/src/runtime/memory/chunk.h index 3be766981b28ac..249136ad29af78 100644 --- a/be/src/runtime/memory/chunk.h +++ b/be/src/runtime/memory/chunk.h @@ -32,7 +32,6 @@ struct Chunk { uint8_t* data = nullptr; size_t size = 0; int core_id = -1; - std::shared_ptr mem_tracker = nullptr; }; } // namespace doris diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp index 77717ee7a79955..1082d577c8dc3f 100644 --- a/be/src/runtime/memory/chunk_allocator.cpp +++ b/be/src/runtime/memory/chunk_allocator.cpp @@ -133,14 +133,13 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_free_cost_ns); } -Status ChunkAllocator::allocate(size_t size, Chunk* chunk, bool check_limits) { +Status ChunkAllocator::allocate(size_t size, Chunk* chunk, const std::shared_ptr& tracker, bool check_limits) { // fast path: allocate from current core arena - chunk->mem_tracker = 
thread_local_ctx.thread_mem_tracker(); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + std::shared_ptr reset_tracker = tracker ? tracker : thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(); if (check_limits) { - RETURN_IF_ERROR(thread_local_ctx.thread_mem_tracker()->transfer_to(chunk->mem_tracker, size)); + RETURN_IF_ERROR(_mem_tracker->try_transfer_to(reset_tracker, size)); } else { - thread_local_ctx.thread_mem_tracker()->transfer_to_force(chunk->mem_tracker, size); + _mem_tracker->transfer_to(reset_tracker, size); } int core_id = CpuInfo::get_current_core(); @@ -170,27 +169,31 @@ Status ChunkAllocator::allocate(size_t size, Chunk* chunk, bool check_limits) { int64_t cost_ns = 0; { + // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); SCOPED_RAW_TIMER(&cost_ns); // allocate from system allocator + // _mem_tracker->consume_cache(size); chunk->data = SystemAllocator::allocate(size); } chunk_pool_system_alloc_count->increment(1); chunk_pool_system_alloc_cost_ns->increment(cost_ns); if (chunk->data == nullptr) { - Status st = chunk->mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), size); + reset_tracker->transfer_to(_mem_tracker, size); return Status::MemoryAllocFailed( fmt::format("ChunkAllocator failed to allocate chunk {} bytes", size)); } return Status::OK(); } -void ChunkAllocator::free(Chunk& chunk) { +void ChunkAllocator::free(Chunk& chunk, const std::shared_ptr& tracker) { if (chunk.core_id == -1) { return; } - DCHECK(chunk.mem_tracker != nullptr); - SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); - Status st = chunk.mem_tracker->transfer_to(thread_local_ctx.thread_mem_tracker(), chunk.size); + if (tracker) { + tracker->transfer_to(_mem_tracker, chunk.size); + } else { + thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->transfer_to(_mem_tracker, chunk.size); + } int64_t old_reserved_bytes = _reserved_bytes; int64_t new_reserved_bytes = 0; do { @@ -198,7 +201,9 @@ void ChunkAllocator::free(Chunk& 
chunk) { if (new_reserved_bytes > _reserve_bytes_limit) { int64_t cost_ns = 0; { + // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); SCOPED_RAW_TIMER(&cost_ns); + // _mem_tracker->release_cache(chunk.size); SystemAllocator::free(chunk.data, chunk.size); } chunk_pool_system_free_count->increment(1); @@ -209,11 +214,10 @@ void ChunkAllocator::free(Chunk& chunk) { } while (!_reserved_bytes.compare_exchange_weak(old_reserved_bytes, new_reserved_bytes)); _arenas[chunk.core_id]->push_free_chunk(chunk.data, chunk.size); - chunk.mem_tracker = nullptr; } -Status ChunkAllocator::allocate_align(size_t size, Chunk* chunk, bool check_limits) { - return allocate(BitUtil::RoundUpToPowerOfTwo(size), chunk, check_limits); +Status ChunkAllocator::allocate_align(size_t size, Chunk* chunk, const std::shared_ptr& tracker, bool check_limits) { + return allocate(BitUtil::RoundUpToPowerOfTwo(size), chunk, tracker, check_limits); } } // namespace doris diff --git a/be/src/runtime/memory/chunk_allocator.h b/be/src/runtime/memory/chunk_allocator.h index 4c141358e4f142..366e66ccd57ece 100644 --- a/be/src/runtime/memory/chunk_allocator.h +++ b/be/src/runtime/memory/chunk_allocator.h @@ -65,12 +65,14 @@ class ChunkAllocator { // Allocate a Chunk with a power-of-two length "size". // Return true if success and allocated chunk is saved in "chunk". // Otherwise return false. 
- Status allocate(size_t size, Chunk* chunk, bool check_limits = false); + Status allocate(size_t size, Chunk* chunk, + const std::shared_ptr& tracker = std::shared_ptr(), bool check_limits = false); - Status allocate_align(size_t size, Chunk* chunk, bool check_limits = false); + Status allocate_align(size_t size, Chunk* chunk, + const std::shared_ptr& tracker = std::shared_ptr(), bool check_limits = false); // Free chunk allocated from this allocator - void free(Chunk& chunk); + void free(Chunk& chunk, const std::shared_ptr& tracker = std::shared_ptr()); private: static ChunkAllocator* _s_instance; diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index 4837770ba738a9..ede3241885b432 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -92,7 +92,9 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, RETURN_IF_ERROR(_runtime_state->init_mem_trackers(_query_id)); SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(), print_id(_runtime_state->query_id()), - _runtime_state->fragment_instance_id(), _runtime_state->instance_mem_tracker()); + _runtime_state->fragment_instance_id(), _runtime_state->instance_mem_tracker()); + // SCOPED_ATTACH_TASK_THREAD_4ARGP(_runtime_state->query_type(), print_id(_runtime_state->query_id()), + // _runtime_state->fragment_instance_id(), _runtime_state->instance_mem_tracker()); _runtime_state->set_be_number(request.backend_num); if (request.__isset.backend_id) { _runtime_state->set_backend_id(request.backend_id); @@ -121,13 +123,13 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, bytes_limit = 2 * 1024 * 1024 * 1024L; } - if (bytes_limit > _exec_env->process_mem_tracker()->limit()) { + if (bytes_limit > MemTracker::get_process_tracker()->limit()) { LOG(WARNING) << "Query memory limit " << PrettyPrinter::print(bytes_limit, TUnit::BYTES) << " exceeds process memory limit of " - 
<< PrettyPrinter::print(_exec_env->process_mem_tracker()->limit(), + << PrettyPrinter::print(MemTracker::get_process_tracker()->limit(), TUnit::BYTES) << ". Using process memory limit instead"; - bytes_limit = _exec_env->process_mem_tracker()->limit(); + bytes_limit = MemTracker::get_process_tracker()->limit(); } RETURN_IF_ERROR(_runtime_state->create_block_mgr()); diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp index eed731b84bedca..7bb1795caa5362 100644 --- a/be/src/runtime/row_batch.cpp +++ b/be/src/runtime/row_batch.cpp @@ -42,7 +42,7 @@ const int RowBatch::AT_CAPACITY_MEM_USAGE = 8 * 1024 * 1024; const int RowBatch::FIXED_LEN_BUFFER_LIMIT = AT_CAPACITY_MEM_USAGE / 2; RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity) - : _mem_tracker(thread_local_ctx.thread_mem_tracker()), + : _mem_tracker(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()), _has_in_flight_row(false), _num_rows(0), _num_uncommitted_rows(0), @@ -75,7 +75,7 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity) // to allocated string data in special mempool // (change via python script that runs over Data_types.cc) RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch) - : _mem_tracker(thread_local_ctx.thread_mem_tracker()), + : _mem_tracker(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()), _has_in_flight_row(false), _num_rows(input_batch.num_rows()), _num_uncommitted_rows(0), @@ -216,138 +216,6 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch) } } -// TODO: we want our input_batch's tuple_data to come from our (not yet implemented) -// global runtime memory segment; how do we get thrift to allocate it from there? 
-// maybe change line (in Data_types.cc generated from Data.thrift) -// xfer += iprot->readString(this->tuple_data[_i9]); -// to allocated string data in special mempool -// (change via python script that runs over Data_types.cc) -RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch) - : _mem_tracker(thread_local_ctx.thread_mem_tracker()), - _has_in_flight_row(false), - _num_rows(input_batch.num_rows), - _num_uncommitted_rows(0), - _capacity(_num_rows), - _flush(FlushMode::NO_FLUSH_RESOURCES), - _needs_deep_copy(false), - _num_tuples_per_row(input_batch.row_tuples.size()), - _row_desc(row_desc), - _auxiliary_mem_usage(0), - _need_to_return(false), - _tuple_data_pool() { - _tuple_ptrs_size = _num_rows * input_batch.row_tuples.size() * sizeof(Tuple*); - DCHECK_GT(_tuple_ptrs_size, 0); - // TODO: switch to Init() pattern so we can check memory limit and return Status. - if (config::enable_partitioned_aggregation) { - SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); - _mem_tracker->consume(_tuple_ptrs_size); - _tuple_ptrs = (Tuple**)malloc(_tuple_ptrs_size); - DCHECK(_tuple_ptrs != nullptr); - } else { - _tuple_ptrs = (Tuple**)_tuple_data_pool.allocate(_tuple_ptrs_size); - } - - char* tuple_data = nullptr; - if (input_batch.is_compressed) { - // Decompress tuple data into data pool - const char* compressed_data = input_batch.tuple_data.c_str(); - size_t compressed_size = input_batch.tuple_data.size(); - size_t uncompressed_size = 0; - bool success = - snappy::GetUncompressedLength(compressed_data, compressed_size, &uncompressed_size); - DCHECK(success) << "snappy::GetUncompressedLength failed"; - tuple_data = (char*)_tuple_data_pool.allocate(uncompressed_size); - success = snappy::RawUncompress(compressed_data, compressed_size, tuple_data); - DCHECK(success) << "snappy::RawUncompress failed"; - } else { - // Tuple data uncompressed, copy directly into data pool - tuple_data = (char*)_tuple_data_pool.allocate(input_batch.tuple_data.size()); - 
memcpy(tuple_data, input_batch.tuple_data.c_str(), input_batch.tuple_data.size()); - } - - // convert input_batch.tuple_offsets into pointers - int tuple_idx = 0; - for (auto offset : input_batch.tuple_offsets) { - if (offset == -1) { - _tuple_ptrs[tuple_idx++] = nullptr; - } else { - _tuple_ptrs[tuple_idx++] = convert_to(tuple_data + offset); - } - } - - // Check whether we have slots that require offset-to-pointer conversion. - if (!_row_desc.has_varlen_slots()) { - return; - } - - const auto& tuple_descs = _row_desc.tuple_descriptors(); - - // For every unique tuple, convert string offsets contained in tuple data into - // pointers. Tuples were serialized in the order we are deserializing them in, - // so the first occurrence of a tuple will always have a higher offset than any tuple - // we already converted. - for (int i = 0; i < _num_rows; ++i) { - TupleRow* row = get_row(i); - for (size_t j = 0; j < tuple_descs.size(); ++j) { - auto desc = tuple_descs[j]; - if (desc->string_slots().empty() && desc->collection_slots().empty()) { - continue; - } - - Tuple* tuple = row->get_tuple(j); - if (tuple == nullptr) { - continue; - } - - for (auto slot : desc->string_slots()) { - DCHECK(slot->type().is_string_type()); - StringValue* string_val = tuple->get_string_slot(slot->tuple_offset()); - - int offset = convert_to(string_val->ptr); - string_val->ptr = tuple_data + offset; - - // Why we do this mask? Field len of StringValue is changed from int to size_t in - // Doris 0.11. When upgrading, some bits of len sent from 0.10 is random value, - // this works fine in version 0.10, however in 0.11 this will lead to an invalid - // length. So we make the high bits zero here. 
- string_val->len &= 0x7FFFFFFFL; - } - - // copy collection slot - for (auto slot_collection : desc->collection_slots()) { - DCHECK(slot_collection->type().is_collection_type()); - CollectionValue* array_val = - tuple->get_collection_slot(slot_collection->tuple_offset()); - - int offset = convert_to(array_val->data()); - array_val->set_data(tuple_data + offset); - int null_offset = convert_to(array_val->null_signs()); - array_val->set_null_signs(convert_to(tuple_data + null_offset)); - - const TypeDescriptor& item_type = slot_collection->type().children.at(0); - if (!item_type.is_string_type()) { - continue; - } - - // copy string item - for (size_t k = 0; k < array_val->length(); ++k) { - if (array_val->is_null_at(k)) { - continue; - } - - StringValue* dst_item_v = convert_to( - (uint8_t*)array_val->data() + k * item_type.get_slot_size()); - - if (dst_item_v->len != 0) { - int offset = convert_to(dst_item_v->ptr); - dst_item_v->ptr = tuple_data + offset; - } - } - } - } - } -} - void RowBatch::clear() { if (_cleared) { return; diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index e8b6d8b36b7fae..cfadb41133e84c 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -160,7 +160,7 @@ Status RuntimeFilterMergeControllerEntity::_init_with_desc( cntVal->filter->init_with_desc(&cntVal->runtime_filter_desc, query_options, _fragment_instance_id); cntVal->_tracker = MemTracker::create_tracker( - -1, thread_local_ctx.thread_mem_tracker()->label() + ":FilterID:" + filter_id); + -1, thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->label() + ":FilterID:" + filter_id); _filter_map.emplace(filter_id, cntVal); return Status::OK(); } diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index e11371ceafe20a..369430e89ea56b 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp @@ -35,7 +35,7 @@ std::atomic 
TabletsChannel::_s_tablet_writer_count; TabletsChannel::TabletsChannel(const TabletsChannelKey& key, bool is_high_priority) : _key(key), _state(kInitialized), _closed_senders(64), _is_high_priority(is_high_priority) { - _mem_tracker = MemTracker::create_tracker(-1, "TabletsChannel:" + key.index_id); + _mem_tracker = MemTracker::create_tracker(-1, "TabletsChannel:" + std::to_string(key.index_id)); static std::once_flag once_flag; std::call_once(once_flag, [] { REGISTER_HOOK_METRIC(tablet_writer_count, [&]() { return _s_tablet_writer_count.load(); }); diff --git a/be/src/runtime/tcmalloc_hook.h b/be/src/runtime/tcmalloc_hook.h index 3b8bc2d53c8f91..ac24208e748ee6 100644 --- a/be/src/runtime/tcmalloc_hook.h +++ b/be/src/runtime/tcmalloc_hook.h @@ -21,15 +21,7 @@ #include "runtime/thread_context.h" -void new_hook(const void* ptr, size_t size) { - doris::thread_local_ctx.consume_mem(tc_nallocx(size, 0)); -} - -void delete_hook(const void* ptr) { - doris::thread_local_ctx.release_mem(tc_malloc_size(const_cast(ptr))); -} - -// Notice: modify the command in New/Delete Hook should be careful enough!!!, +// Notice: modify the command in New/Delete Hook should be careful enough!, // and should be as simple as possible, otherwise it may cause weird errors. E.g: // 1. The first New Hook call of the process may be before some variables of // the process are initialized. @@ -38,6 +30,17 @@ void delete_hook(const void* ptr) { // 3. TCMalloc hook will be triggered during the process of initializing/Destructor // memtracker shared_ptr, Using the object pointed to by this memtracker shared_ptr // in TCMalloc hook may cause crash. +// 4. 
Modifying additional thread local variables in ThreadContext construction and +// destructor to control the behavior of consume can lead to unexpected behavior, +// like this: if (LIKELY(doris::thread_mem_tracker_mgr_init)) { +void new_hook(const void* ptr, size_t size) { + doris::thread_local_ctx.get()->consume_mem(tc_nallocx(size, 0)); +} + +void delete_hook(const void* ptr) { + doris::thread_local_ctx.get()->release_mem(tc_malloc_size(const_cast(ptr))); +} + void init_hook() { MallocHook::AddNewHook(&new_hook); MallocHook::AddDeleteHook(&delete_hook); diff --git a/be/src/runtime/thread_context.cpp b/be/src/runtime/thread_context.cpp new file mode 100644 index 00000000000000..8aec6898c76ae1 --- /dev/null +++ b/be/src/runtime/thread_context.cpp @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "runtime/thread_context.h" + +namespace doris { + +DEFINE_STATIC_THREAD_LOCAL(ThreadContext, ThreadContextPtr, thread_local_ctx); + +ThreadContextPtr::ThreadContextPtr() { + INIT_STATIC_THREAD_LOCAL(ThreadContext, thread_local_ctx); +} + +ThreadContext* ThreadContextPtr::get() { + return thread_local_ctx; +} + +} diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 72bd87afa3558a..f4550b95e99750 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -21,6 +21,7 @@ #include <thread> #include "runtime/thread_mem_tracker_mgr.h" +#include "runtime/threadlocal.h" // Attach to task when thread starts #define SCOPED_ATTACH_TASK_THREAD_2ARG(type, mem_tracker) \ @@ -28,9 +29,17 @@ #define SCOPED_ATTACH_TASK_THREAD_4ARG(query_type, task_id, fragment_instance_id, mem_tracker) \ auto VARNAME_LINENUM(attach_task_thread) = \ AttachTaskThread(query_type, task_id, fragment_instance_id, mem_tracker) +#define SCOPED_ATTACH_TASK_THREAD_4ARGP(query_type, task_id, fragment_instance_id, mem_tracker) \ + auto VARNAME_LINENUM(attach_task_thread) = \ + AttachTaskThreadP(query_type, task_id, fragment_instance_id, mem_tracker) // Toggle MemTracker during thread execution +// Must be switched inside SCOPED_ATTACH_TASK_THREAD, otherwise the cached mem tracker may not be released +// Must-see Notes: do not use SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER and SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER frequently, +// because the construction/destruction order of variables, and the allocation/release order of memory, may differ from the instruction execution order (to be further investigated); this may cause the tracker or location of the memory statistics to be unexpected; #define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker) \ auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker) +#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARGP(mem_tracker) \ + auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTrackerP(mem_tracker) #define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker, action_type) \ do { \ auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker); \ @@ -55,7 +64,7 @@ auto VARNAME_LINENUM(stop_tracker) = 
StopThreadMemTracker(true) #define GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER() \ auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(false) -#define CHECK_MEM_LIMIT(size) thread_local_ctx.thread_mem_tracker()->check_limit(size) +#define CHECK_MEM_LIMIT(size) thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->check_limit(size) namespace doris { @@ -83,10 +92,10 @@ class ThreadContext { ThreadContext() : _thread_id(std::this_thread::get_id()), _type(TaskType::UNKNOWN) { _thread_mem_tracker_mgr.reset(new ThreadMemTrackerMgr()); } - ~ThreadContext() {} void attach(const TaskType& type, const std::string& task_id, const TUniqueId& fragment_instance_id, std::shared_ptr mem_tracker) { + DCHECK(_type == TaskType::UNKNOWN && _task_id == ""); _type = type; _task_id = task_id; _fragment_instance_id = fragment_instance_id; @@ -107,49 +116,17 @@ class ThreadContext { const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } void consume_mem(int64_t size) { - if (_thread_mem_tracker_mgr != nullptr) { + if (thread_mem_tracker_mgr_init == true) { _thread_mem_tracker_mgr->cache_consume(size); } } void release_mem(int64_t size) { - if (_thread_mem_tracker_mgr != nullptr) { + if (thread_mem_tracker_mgr_init == true) { _thread_mem_tracker_mgr->cache_consume(-size); } } - std::shared_ptr thread_mem_tracker() { - return _thread_mem_tracker_mgr->mem_tracker().lock(); - } - std::weak_ptr update_thread_tracker(std::weak_ptr mem_tracker) { - return _thread_mem_tracker_mgr->update_tracker(mem_tracker); - } - std::shared_ptr update_thread_tracker_call_back( - const std::string& action_type, bool cancel_task, ERRCALLBACK err_call_back_func) { - return _thread_mem_tracker_mgr->update_consume_err_call_back(action_type, cancel_task, - err_call_back_func); - } - std::shared_ptr update_thread_tracker_call_back( - std::shared_ptr tracker_call_back) { - return _thread_mem_tracker_mgr->update_consume_err_call_back(tracker_call_back); - } - void start_mem_tracker() { 
- if (_thread_mem_tracker_mgr != nullptr) { - _thread_mem_tracker_mgr->start_mem_tracker(); - } - } - void stop_mem_tracker() { - if (_thread_mem_tracker_mgr != nullptr) { - _thread_mem_tracker_mgr->stop_mem_tracker(); - } - } - -private: - std::thread::id _thread_id; - TaskType _type; - std::string _task_id; - TUniqueId _fragment_instance_id; - // After _thread_mem_tracker_mgr is initialized, the current thread TCMalloc Hook starts to // consume/release mem_tracker. // Note that the use of shared_ptr will cause a crash. The guess is that there is an @@ -157,9 +134,35 @@ class ThreadContext { // to nullptr, but the object it points to is not initialized. At this time, when the memory // is released somewhere, the TCMalloc hook is triggered to cause the crash. std::unique_ptr _thread_mem_tracker_mgr; + // ThreadMemTrackerMgr* _thread_mem_tracker_mgr; + +private: + std::thread::id _thread_id; + TaskType _type; + std::string _task_id; + TUniqueId _fragment_instance_id; +}; + +// inline thread_local ThreadContext thread_local_ctx; +// inline BLOCK_STATIC_THREAD_LOCAL2(ThreadContext, thread_local_ctx); + +// static ThreadContext* load_tls() { +// thread_mem_tracker_mgr_init = false; +// BLOCK_STATIC_THREAD_LOCAL(ThreadContext, ctx2); +// thread_mem_tracker_mgr_init = true; +// return ctx2; +// } + +class ThreadContextPtr { +public: + ThreadContextPtr(); + + ThreadContext* get(); +private: + DECLARE_STATIC_THREAD_LOCAL(ThreadContext, thread_local_ctx); }; -inline thread_local ThreadContext thread_local_ctx; +inline thread_local ThreadContextPtr thread_local_ctx; inline const std::string task_type_string(ThreadContext::TaskType type) { switch (type) { @@ -181,14 +184,14 @@ inline const std::string ThreadContext::get_type() const { class AttachTaskThread { public: explicit AttachTaskThread(const ThreadContext::TaskType& type, - std::shared_ptr mem_tracker) { + const std::shared_ptr mem_tracker) { DCHECK(mem_tracker != nullptr); init(type, "", TUniqueId(), mem_tracker); } 
explicit AttachTaskThread(const TQueryType::type& query_type, const std::string& task_id, const TUniqueId& fragment_instance_id, - std::shared_ptr mem_tracker) { + const std::shared_ptr& mem_tracker) { DCHECK(task_id != "" && fragment_instance_id != TUniqueId() && mem_tracker != nullptr); if (query_type == TQueryType::SELECT) { init(ThreadContext::TaskType::QUERY, task_id, fragment_instance_id, mem_tracker); @@ -197,33 +200,71 @@ class AttachTaskThread { } } - void init(const ThreadContext::TaskType& type, const std::string& task_id = "", - const TUniqueId& fragment_instance_id = TUniqueId(), - std::shared_ptr mem_tracker = nullptr) { - thread_local_ctx.attach(type, task_id, fragment_instance_id, mem_tracker); + void init(const ThreadContext::TaskType& type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker) { + // thread_local_ctx.get()->attach(type, task_id, fragment_instance_id, mem_tracker); } - ~AttachTaskThread() { thread_local_ctx.detach(); } + // ~AttachTaskThread() { thread_local_ctx.get()->detach(); } }; -class SwitchThreadMemTracker { +class AttachTaskThreadP { public: - explicit SwitchThreadMemTracker(std::shared_ptr mem_tracker) { + explicit AttachTaskThreadP(const ThreadContext::TaskType& type, + const std::shared_ptr mem_tracker) { DCHECK(mem_tracker != nullptr); - if (mem_tracker != thread_local_ctx.thread_mem_tracker()) { - _old_mem_tracker = thread_local_ctx.update_thread_tracker(mem_tracker); + init(type, "", TUniqueId(), mem_tracker); + } + + explicit AttachTaskThreadP(const TQueryType::type& query_type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker) { + DCHECK(task_id != "" && fragment_instance_id != TUniqueId() && mem_tracker != nullptr); + if (query_type == TQueryType::SELECT) { + init(ThreadContext::TaskType::QUERY, task_id, fragment_instance_id, mem_tracker); + } else if (query_type == TQueryType::LOAD) { + 
init(ThreadContext::TaskType::LOAD, task_id, fragment_instance_id, mem_tracker); } } + void init(const ThreadContext::TaskType& type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker) { + // thread_local_ctx.get()->attach(type, task_id, fragment_instance_id, mem_tracker); + } + + // ~AttachTaskThreadP() { thread_local_ctx.get()->detach(); } +}; + +class SwitchThreadMemTracker { +public: + explicit SwitchThreadMemTracker(const std::shared_ptr& mem_tracker) { + DCHECK(mem_tracker != nullptr); + // _old_tracker_id = thread_local_ctx.get()->_thread_mem_tracker_mgr->update_tracker(mem_tracker); + } + ~SwitchThreadMemTracker() { - std::shared_ptr p = _old_mem_tracker.lock(); - if (p) { - thread_local_ctx.update_thread_tracker(_old_mem_tracker); - } + // thread_local_ctx.get()->_thread_mem_tracker_mgr->set_tracker_id(_old_tracker_id); + } + +private: + std::string _old_tracker_id; +}; + +class SwitchThreadMemTrackerP { +public: + explicit SwitchThreadMemTrackerP(const std::shared_ptr& mem_tracker) { + DCHECK(mem_tracker != nullptr); + // _old_tracker_id = thread_local_ctx.get()->_thread_mem_tracker_mgr->update_trackerP(mem_tracker); + } + + ~SwitchThreadMemTrackerP() { + // thread_local_ctx.get()->_thread_mem_tracker_mgr->set_tracker_idP(_old_tracker_id); } private: - std::weak_ptr _old_mem_tracker; + std::string _old_tracker_id; }; class SwitchThreadMemTrackerCallBack { @@ -246,26 +287,26 @@ class SwitchThreadMemTrackerCallBack { void init(const std::string& action_type = std::string(), bool cancel_work = true, ERRCALLBACK err_call_back_func = nullptr) { - _old_tracker_call_back = thread_local_ctx.update_thread_tracker_call_back( + _old_tracker_call_back = thread_local_ctx.get()->_thread_mem_tracker_mgr->update_consume_err_call_back( action_type, cancel_work, err_call_back_func); } ~SwitchThreadMemTrackerCallBack() { - thread_local_ctx.update_thread_tracker_call_back(_old_tracker_call_back); + 
thread_local_ctx.get()->_thread_mem_tracker_mgr->update_consume_err_call_back(_old_tracker_call_back); } private: - std::shared_ptr _old_tracker_call_back; + ConsumeErrCallBackInfo _old_tracker_call_back; }; class StopThreadMemTracker { public: explicit StopThreadMemTracker(const bool scope = true) : _scope(scope) { - thread_local_ctx.stop_mem_tracker(); + thread_mem_tracker_mgr_init = false; } ~StopThreadMemTracker() { - if (_scope == true) thread_local_ctx.start_mem_tracker(); + if (_scope == true) thread_mem_tracker_mgr_init = true; } private: diff --git a/be/src/runtime/thread_mem_tracker_mgr.cpp b/be/src/runtime/thread_mem_tracker_mgr.cpp index 2c3696e7f85d23..39915a3c0211ad 100644 --- a/be/src/runtime/thread_mem_tracker_mgr.cpp +++ b/be/src/runtime/thread_mem_tracker_mgr.cpp @@ -22,76 +22,42 @@ namespace doris { -std::shared_ptr ThreadMemTrackerMgr::default_mem_tracker() { - std::shared_ptr process_tracker = ExecEnv::GetInstance()->process_mem_tracker(); - if (process_tracker != nullptr) { - return process_tracker; - } else { - return MemTracker::get_root_tracker(); - } -} - void ThreadMemTrackerMgr::attach_task(const std::string& action_type, const std::string& task_id, const TUniqueId& fragment_instance_id, - std::shared_ptr mem_tracker) { - DCHECK(task_id != "" && fragment_instance_id != TUniqueId()); + const std::shared_ptr& mem_tracker) { _task_id = task_id; _fragment_instance_id = fragment_instance_id; - _consume_err_call_back = std::make_shared(action_type, true, nullptr); + _consume_err_call_back.update(action_type, true, nullptr); if (mem_tracker == nullptr) { #ifdef BE_TEST if (ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) { return; } #endif - update_tracker( - ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker( - task_id)); + _temp_task_mem_tracker = ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker(task_id); + update_tracker(_temp_task_mem_tracker); } else { 
update_tracker(mem_tracker); } } void ThreadMemTrackerMgr::detach_task() { - update_tracker(default_mem_tracker()); _task_id = ""; _fragment_instance_id = TUniqueId(); - _consume_err_call_back = std::make_shared("", false, nullptr); -} - -std::weak_ptr ThreadMemTrackerMgr::update_tracker( - std::weak_ptr mem_tracker) { - if (_untracked_mem != 0) { - noncache_consume(); - _untracked_mem = 0; - } - DCHECK(!_mem_tracker.expired()); - DCHECK(!mem_tracker.expired()); - std::weak_ptr old_mem_tracker = _mem_tracker.lock(); - _mem_tracker = mem_tracker; - return old_mem_tracker; -} - -std::shared_ptr ThreadMemTrackerMgr::update_consume_err_call_back( - const std::string& action_type, bool cancel_task, ERRCALLBACK call_back_func) { - std::shared_ptr old_consume_err_call_back = _consume_err_call_back; - _consume_err_call_back = - std::make_shared(action_type, cancel_task, call_back_func); - return old_consume_err_call_back; -} - -std::shared_ptr ThreadMemTrackerMgr::update_consume_err_call_back( - std::shared_ptr consume_err_call_back) { - std::shared_ptr old_consume_err_call_back = _consume_err_call_back; - _consume_err_call_back = consume_err_call_back; - return old_consume_err_call_back; + _consume_err_call_back.init(); + clear_untracked_mems(); + _tracker_id = "process"; + _untracked_mems.clear(); + _untracked_mems["process"] = 0; + _mem_trackers.clear(); + _mem_trackers["process"] = MemTracker::get_process_tracker(); } void ThreadMemTrackerMgr::exceeded_cancel_task(const std::string& cancel_details) { - std::shared_ptr task_mem_tracker = + _temp_task_mem_tracker = ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker( _task_id); - if (task_mem_tracker != nullptr && task_mem_tracker->limit_exceeded() && + if (_temp_task_mem_tracker != nullptr && _temp_task_mem_tracker->limit_exceeded() && _fragment_instance_id != TUniqueId() && ExecEnv::GetInstance()->initialized() && 
ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { ExecEnv::GetInstance()->fragment_mgr()->cancel( @@ -102,48 +68,14 @@ void ThreadMemTrackerMgr::exceeded_cancel_task(const std::string& cancel_details } void ThreadMemTrackerMgr::exceeded(int64_t mem_usage, Status st) { - auto rst = _mem_tracker.lock()->mem_limit_exceeded( - nullptr, "In TCMalloc Hook, " + _consume_err_call_back->action_type, mem_usage, st); - if (_consume_err_call_back->call_back_func != nullptr) { - _consume_err_call_back->call_back_func(); + auto rst = _mem_trackers[_tracker_id]->mem_limit_exceeded( + nullptr, "In TCMalloc Hook, " + _consume_err_call_back.action_type, mem_usage, st); + if (_consume_err_call_back.call_back_func != nullptr) { + _consume_err_call_back.call_back_func(); } - if (_task_id != "" && _consume_err_call_back->cancel_task == true) { + if (_task_id != "" && _consume_err_call_back.cancel_task == true) { exceeded_cancel_task(rst.to_string()); } } -void ThreadMemTrackerMgr::noncache_consume() { - _stop_mem_tracker = true; - { - // Ensure thread safety - auto tracker = _mem_tracker.lock(); - // The first time get_root_tracker is called after the main thread starts, == nullptr. - if (tracker) { - Status st = _mem_tracker.lock()->try_consume(_untracked_mem); - if (!st) { - // The memory has been allocated, so when TryConsume fails, need to continue to complete - // the consume to ensure the accuracy of the statistics. - _mem_tracker.lock()->consume(_untracked_mem); - exceeded(_untracked_mem, st); - } - } - } - _stop_mem_tracker = false; -} - -void ThreadMemTrackerMgr::cache_consume(int64_t size) { - if (_stop_mem_tracker == true) { - return; - } - _untracked_mem += size; - // When some threads `0 < _untracked_mem < _tracker_consume_cache_size` - // and some threads `_untracked_mem <= -_tracker_consume_cache_size` trigger consumption(), - // it will cause tracker->consumption to be temporarily less than 0. 
- if (_untracked_mem >= _tracker_consume_cache_size || - _untracked_mem <= -_tracker_consume_cache_size) { - noncache_consume(); - _untracked_mem = 0; - } -} - } // namespace doris diff --git a/be/src/runtime/thread_mem_tracker_mgr.h b/be/src/runtime/thread_mem_tracker_mgr.h index 5d56453ff260e5..9869c1aeabeac7 100644 --- a/be/src/runtime/thread_mem_tracker_mgr.h +++ b/be/src/runtime/thread_mem_tracker_mgr.h @@ -32,10 +32,33 @@ struct ConsumeErrCallBackInfo { bool cancel_task; // Whether to cancel the task when the current tracker exceeds the limit ERRCALLBACK call_back_func; + ConsumeErrCallBackInfo() { + init(); + } + ConsumeErrCallBackInfo(std::string action_type, bool cancel_task, ERRCALLBACK call_back_func) : action_type(action_type), cancel_task(cancel_task), call_back_func(call_back_func) {} + + void update(std::string new_action_type, bool new_cancel_task, ERRCALLBACK new_call_back_func) { + action_type = new_action_type; + cancel_task = new_cancel_task; + call_back_func = new_call_back_func; + } + + void init() { + action_type = ""; + cancel_task = false; + call_back_func = nullptr; + } }; +// If there is a memory new/delete operation in the consume method, it may enter infinite recursion. +// Note: After the tracker is stopped, the memory alloc in the consume method should be released in time, +// otherwise the MemTracker statistics will be inaccurate. +// In some cases, we want to turn off thread automatic memory statistics, manually call consume. +// In addition, when ~RootTracker, TCMalloc delete hook release RootTracker will crash. +inline thread_local bool thread_mem_tracker_mgr_init = false; + // TCMalloc new/delete Hook is counted in the memory_tracker of the current thread. // // In the original design, the MemTracker consume method is called before the memory is allocated. @@ -45,25 +68,87 @@ struct ConsumeErrCallBackInfo { // need to manually call cosume after stop_mem_tracker, and then start_mem_tracker. 
class ThreadMemTrackerMgr { public: - ThreadMemTrackerMgr() : _mem_tracker(default_mem_tracker()) { - _consume_err_call_back = std::make_shared("", false, nullptr); + ThreadMemTrackerMgr() { + _mem_trackers["process"] = MemTracker::get_process_tracker(); + _untracked_mems["process"] = 0; + _tracker_id = "process"; + thread_mem_tracker_mgr_init = true; + } + ~ThreadMemTrackerMgr() { + clear_untracked_mems(); + thread_mem_tracker_mgr_init = false; } - ~ThreadMemTrackerMgr() { detach_task(); } - std::shared_ptr default_mem_tracker(); + void clear_untracked_mems() { + for(auto untracked_mem : _untracked_mems) { + // auto tracker = _mem_trackers[untracked_mem.first].lock(); + if (untracked_mem.second != 0) { + // if (_mem_trackers[untracked_mem.first]) { + _mem_trackers[untracked_mem.first]->consume(untracked_mem.second); + // _mem_trackers[untracked_mem.first]->consume(untracked_mem.second); + // } else { + // DCHECK(_tracker_id == "process"); + // _root_mem_tracker->consume(untracked_mem.second); + // } + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(untracked_mem.second); + // } + } + } + _mem_trackers[_tracker_id]->consume(_untracked_mem); + _untracked_mem = 0; + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem); + // } + if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + _untracked_mem2 = 0; + } + } // After attach, the current thread TCMalloc Hook starts to consume/release task mem_tracker void attach_task(const std::string& action_type, const std::string& task_id, const TUniqueId& fragment_instance_id, - std::shared_ptr mem_tracker); + const std::shared_ptr& mem_tracker); void detach_task(); - std::weak_ptr update_tracker(std::weak_ptr mem_tracker); - std::shared_ptr update_consume_err_call_back( - const 
std::string& action_name, bool cancel_task, ERRCALLBACK call_back_func); - std::shared_ptr update_consume_err_call_back( - std::shared_ptr consume_err_call_back); + // Must be fast enough!!! + // Thread update_tracker may be called very frequently, adding a memory copy will be slow. + std::string update_tracker(const std::shared_ptr& mem_tracker); + std::string update_trackerP(const std::shared_ptr& mem_tracker); + + void set_tracker_id(const std::string& tracker_id) { + // DCHECK(_untracked_mem == 0); + if (tracker_id != _tracker_id) { + // _untracked_mems[_tracker_id] += _untracked_mem; + _mem_trackers[_tracker_id]->consume_cache(_untracked_mem); + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + // } + _untracked_mem = 0; + _tracker_id = tracker_id; + } + // std::swap(_untracked_mems[_tracker_id], _untracked_mem); + + // _untracked_mem = _untracked_mems[_tracker_id]; + // _untracked_mems[_tracker_id] = 0; + } + + void set_tracker_idP(const std::string& tracker_id) { + if (tracker_id != _tracker_id) { + _mem_trackers[_tracker_id]->consume_cache(_untracked_mem); + _untracked_mem = 0; + _tracker_id = tracker_id; + } + } + + inline ConsumeErrCallBackInfo update_consume_err_call_back( + const std::string& action_type, bool cancel_task, ERRCALLBACK call_back_func); + + inline void update_consume_err_call_back(ConsumeErrCallBackInfo& consume_err_call_back) { + _consume_err_call_back = consume_err_call_back; + } // Note that, If call the memory allocation operation in TCMalloc new/delete Hook, // such as calling LOG/iostream/sstream/stringstream/etc. 
related methods, @@ -72,9 +157,18 @@ class ThreadMemTrackerMgr { void noncache_consume(); - std::weak_ptr<MemTracker> mem_tracker() { return _mem_tracker; } - void stop_mem_tracker() { _stop_mem_tracker = true; } - void start_mem_tracker() { _stop_mem_tracker = false; } + // Frequent weak_ptr.lock() is expensive + std::shared_ptr<MemTracker> mem_tracker() { + // if (_shared_mem_tracker == nullptr || _shared_mem_tracker->id() != _tracker_id) { + // _shared_mem_tracker = _mem_trackers[_tracker_id]; + // } + // if (_mem_trackers[_tracker_id]) { + return _mem_trackers[_tracker_id]; + // } else { + // DCHECK(_tracker_id == "process"); + // return MemTracker::get_root_tracker(); + // } + } private: // If tryConsume fails due to task mem tracker exceeding the limit, the task must be canceled @@ -83,24 +177,146 @@ class ThreadMemTrackerMgr { void exceeded(int64_t mem_usage, Status st); private: - std::weak_ptr<MemTracker> _mem_tracker; + // Avoid the cost of shared_ptr use-count updates + std::unordered_map<std::string, std::shared_ptr<MemTracker>> _mem_trackers; + // label + timestamp + std::string _tracker_id; + // MemTracker* _process_mem_tracker; - // Consume size smaller than _tracker_consume_cache_size will continue to accumulate + // Consume size smaller than mem_tracker_consume_min_size_bytes will continue to accumulate // to avoid frequent calls to consume/release of MemTracker. + std::unordered_map<std::string, int64_t> _untracked_mems; + // Cache untracked mem, only update to _untracked_mems when switching mem tracker. + // Frequent calls to unordered_map _untracked_mems[] in cache_consume will degrade performance. int64_t _untracked_mem = 0; - int64_t _tracker_consume_cache_size = config::mem_tracker_consume_min_size_bytes; + int64_t _untracked_mem2 = 0; - // If there is a memory new/delete operation in the consume method, it may enter infinite recursion. - // Note: After the tracker is stopped, the memory alloc in the consume method should be released in time, - // otherwise the MemTracker statistics will be inaccurate. 
- // In some cases, we want to turn off thread automatic memory statistics, manually call consume. - // In addition, when ~RootTracker, TCMalloc delete hook release RootTracker will crash. - bool _stop_mem_tracker = false; + ConsumeErrCallBackInfo _consume_err_call_back; - std::shared_ptr _consume_err_call_back; + // Avoid memory allocation in functions and fall into an infinite loop + std::string _temp_tracker_id; + ConsumeErrCallBackInfo _temp_consume_err_call_back; + std::shared_ptr _temp_task_mem_tracker; std::string _task_id; TUniqueId _fragment_instance_id; }; +inline std::string ThreadMemTrackerMgr::update_tracker(const std::shared_ptr& mem_tracker) { + DCHECK(mem_tracker != nullptr); + DCHECK(_mem_trackers[_tracker_id]); + _temp_tracker_id = mem_tracker->id(); + if (_temp_tracker_id == _tracker_id) { + return _tracker_id; + } + if (_mem_trackers.find(_temp_tracker_id) == _mem_trackers.end()) { + _mem_trackers[_temp_tracker_id] = mem_tracker; + _untracked_mems[_temp_tracker_id] = 0; + } + // _untracked_mems[_tracker_id] += _untracked_mem; + _mem_trackers[_tracker_id]->consume(_untracked_mem); + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + // } + _untracked_mem = 0; + std::swap(_tracker_id, _temp_tracker_id); + // if (_mem_trackers.find(_temp_tracker_id) == _mem_trackers.end()) { + // _mem_trackers[_temp_tracker_id] = mem_tracker; + // _untracked_mems[_tracker_id] += _untracked_mem; + // _untracked_mem = 0; + // // std::swap(_untracked_mems[_tracker_id], _untracked_mem); + // // DCHECK(_untracked_mem == 0); + // std::swap(_tracker_id, _temp_tracker_id); + // _untracked_mems[_tracker_id] = 0; + // } else { + // // std::swap(_untracked_mems[_tracker_id], _untracked_mem); + // // DCHECK(_untracked_mem == 0); + // _untracked_mems[_tracker_id] += _untracked_mem; + // _untracked_mem = 0; + // std::swap(_tracker_id, _temp_tracker_id); + // // 
std::swap(_untracked_mems[_tracker_id], _untracked_mem); + // } + DCHECK(_mem_trackers[_temp_tracker_id]); + return _temp_tracker_id; // old tracker_id + // return _tracker_id; +} + +inline std::string ThreadMemTrackerMgr::update_trackerP(const std::shared_ptr& mem_tracker) { + _temp_tracker_id = mem_tracker->id(); + if (_temp_tracker_id == _tracker_id) { + return _tracker_id; + } + if (_mem_trackers.find(_temp_tracker_id) == _mem_trackers.end()) { + _mem_trackers[_temp_tracker_id] = mem_tracker; + _untracked_mems[_temp_tracker_id] = 0; + } + _mem_trackers[_tracker_id]->consume(_untracked_mem); + _untracked_mem = 0; + std::swap(_tracker_id, _temp_tracker_id); + DCHECK(_mem_trackers[_temp_tracker_id]); + return _temp_tracker_id; // old tracker_id + // return _tracker_id; +} + +inline ConsumeErrCallBackInfo ThreadMemTrackerMgr::update_consume_err_call_back( + const std::string& action_type, bool cancel_task, ERRCALLBACK call_back_func) { + _temp_consume_err_call_back = _consume_err_call_back; + _consume_err_call_back.update(action_type, cancel_task, call_back_func); + return _temp_consume_err_call_back; +} + +inline void ThreadMemTrackerMgr::cache_consume(int64_t size) { + // _untracked_mems[_tracker_id] += size; + _untracked_mem += size; + _untracked_mem2 += size; + // When some threads `0 < _untracked_mem < config::mem_tracker_consume_min_size_bytes` + // and some threads `_untracked_mem <= -config::mem_tracker_consume_min_size_bytes` trigger consumption(), + // it will cause tracker->consumption to be temporarily less than 0. 
+ if (_untracked_mem >= config::mem_tracker_consume_min_size_bytes || + _untracked_mem <= -config::mem_tracker_consume_min_size_bytes) { + // DCHECK(_mem_trackers.find(_tracker_id) != _mem_trackers.end()); + thread_mem_tracker_mgr_init = false; + if (_untracked_mems[_tracker_id] != 0) { + _untracked_mem += _untracked_mems[_tracker_id]; + _untracked_mems[_tracker_id] = 0; + } + noncache_consume(); + // _untracked_mem = 0; + // _untracked_mem2 = 0; + thread_mem_tracker_mgr_init = true; + } + + + if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + if (_untracked_mem2 >= config::mem_tracker_consume_min_size_bytes || + _untracked_mem2 <= -config::mem_tracker_consume_min_size_bytes) { + ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + _untracked_mem2 = 0; + } + } +} + +inline void ThreadMemTrackerMgr::noncache_consume() { + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + // } + // Ensure thread safety + // auto tracker = _mem_trackers[_tracker_id].lock(); + if (_mem_trackers[_tracker_id]) { + Status st = _mem_trackers[_tracker_id]->try_consume(_untracked_mem); + if (!st) { + // The memory has been allocated, so when TryConsume fails, need to continue to complete + // the consume to ensure the accuracy of the statistics. 
+ _mem_trackers[_tracker_id]->consume(_untracked_mem); + exceeded(_untracked_mem, st); + } + _untracked_mem = 0; + } + // else { + // DCHECK(_tracker_id == "process"); + // _mem_trackers["process"] = ExecEnv::GetInstance()->process_mem_tracker(); + // _root_mem_tracker->consume(_untracked_mems[_tracker_id]); + // } +} + } // namespace doris diff --git a/be/src/runtime/threadlocal.cc b/be/src/runtime/threadlocal.cc new file mode 100644 index 00000000000000..ac2bf2e62a9094 --- /dev/null +++ b/be/src/runtime/threadlocal.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "runtime/threadlocal.h" + +#include + +#include +#include +#include + +#include "common/logging.h" +#include "gutil/once.h" +#include "util/errno.h" + +namespace doris { + +// One key used by the entire process to attach destructors on thread exit. +static pthread_key_t destructors_key; + +// The above key must only be initialized once per process. +static GoogleOnceType once = GOOGLE_ONCE_INIT; + +namespace { + +// List of destructors for all thread locals instantiated on a given thread. 
+struct PerThreadDestructorList { + void (*destructor)(void*); + void* arg; + PerThreadDestructorList* next; +}; + +} // anonymous namespace + +// Call all the destructors associated with all THREAD_LOCAL instances in this +// thread. +static void invoke_destructors(void* t) { + PerThreadDestructorList* d = reinterpret_cast(t); + while (d != nullptr) { + d->destructor(d->arg); + PerThreadDestructorList* next = d->next; + delete d; + d = next; + } +} + +// This key must be initialized only once. +static void create_key() { + int ret = pthread_key_create(&destructors_key, &invoke_destructors); + // Linux supports up to 1024 keys, we will use only one for all thread locals. + CHECK_EQ(0, ret) << "pthread_key_create() failed, cannot add destructor to thread: " + << "error " << ret << ": " << errno_to_string(ret); +} + +// Adds a destructor to the list. +void add_destructor(void (*destructor)(void*), void* arg) { + GoogleOnceInit(&once, &create_key); + + // Returns NULL if nothing is set yet. + std::unique_ptr p(new PerThreadDestructorList()); + p->destructor = destructor; + p->arg = arg; + p->next = reinterpret_cast(pthread_getspecific(destructors_key)); + int ret = pthread_setspecific(destructors_key, p.release()); + // The only time this check should fail is if we are out of memory, or if + // somehow key creation failed, which should be caught by the above CHECK. + CHECK_EQ(0, ret) << "pthread_setspecific() failed, cannot update destructor list: " + << "error " << ret << ": " << errno_to_string(ret); +} + +} // namespace doris diff --git a/be/src/runtime/threadlocal.h b/be/src/runtime/threadlocal.h new file mode 100644 index 00000000000000..53956589595ae4 --- /dev/null +++ b/be/src/runtime/threadlocal.h @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef THREADLOCAL_H_ +#define THREADLOCAL_H_ + +// Block-scoped static thread local implementation. +// +// Usage is similar to a C++11 thread_local. The BLOCK_STATIC_THREAD_LOCAL macro +// defines a thread-local pointer to the specified type, which is lazily +// instantiated by any thread entering the block for the first time. The +// constructor for the type T is invoked at macro execution time, as expected, +// and its destructor is invoked when the corresponding thread's Runnable +// returns, or when the thread exits. +// +// Inspired by Poco , +// Andrew Tomazos , and +// the C++11 thread_local API. +// +// Example usage: +// +// // Invokes a 3-arg constructor on SomeClass: +// BLOCK_STATIC_THREAD_LOCAL(SomeClass, instance, arg1, arg2, arg3); +// instance->DoSomething(); +// +#define BLOCK_STATIC_THREAD_LOCAL(T, t, ...) \ + static __thread T* t; \ + do { \ + if (PREDICT_FALSE(t == NULL)) { \ + t = new T(__VA_ARGS__); \ + add_destructor(destroy, t); \ + } \ + } while (false) + +// Class-scoped static thread local implementation. +// +// Very similar in implementation to the above block-scoped version, but +// requires a bit more syntax and vigilance to use properly. +// +// DECLARE_STATIC_THREAD_LOCAL(Type, instance_var_) must be placed in the +// class header, as usual for variable declarations. 
+// +// Because these variables are static, they must also be defined in the impl +// file with DEFINE_STATIC_THREAD_LOCAL(Type, Classname, instance_var_), +// which is very much like defining any static member, i.e. int Foo::member_. +// +// Finally, each thread must initialize the instance before using it by calling +// INIT_STATIC_THREAD_LOCAL(Type, instance_var_, ...). This is a cheap +// call, and may be invoked at the top of any method which may reference a +// thread-local variable. +// +// Due to all of these requirements, you should probably declare TLS members +// as private. +// +// Example usage: +// +// // foo.h +// #include "kudu/utils/file.h" +// class Foo { +// public: +// void DoSomething(std::string s); +// private: +// DECLARE_STATIC_THREAD_LOCAL(utils::File, file_); +// }; +// +// // foo.cc +// #include "kudu/foo.h" +// DEFINE_STATIC_THREAD_LOCAL(utils::File, Foo, file_); +// void Foo::WriteToFile(std::string s) { +// // Call constructor if necessary. +// INIT_STATIC_THREAD_LOCAL(utils::File, file_, "/tmp/file_location.txt"); +// file_->Write(s); +// } + +// Goes in the class declaration (usually in a header file). +// dtor must be destructed _after_ t, so it gets defined first. +// Uses a mangled variable name for dtor since it must also be a member of the +// class. +#define DECLARE_STATIC_THREAD_LOCAL(T, t) static __thread T* t + +// You must also define the instance in the .cc file. +#define DEFINE_STATIC_THREAD_LOCAL(T, Class, t) __thread T* Class::t + +// Must be invoked at least once by each thread that will access t. +#define INIT_STATIC_THREAD_LOCAL(T, t, ...) \ + do { \ + if (PREDICT_FALSE(t == NULL)) { \ + t = new T(__VA_ARGS__); \ + add_destructor(destroy, t); \ + } \ + } while (false) + +// Internal implementation below. + +namespace doris { + +// Add a destructor to the list. +void add_destructor(void (*destructor)(void*), void* arg); + +// Destroy the passed object of type T. 
+template +static void destroy(void* t) { + // With tcmalloc, this should be pretty cheap (same thread as new). + delete reinterpret_cast(t); +} + +} // namespace doris + +#endif // THREADLOCAL_H_ diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 62b07ee0745cc1..c181db12ed90b9 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -47,6 +47,7 @@ #include "runtime/heartbeat_flags.h" #include "runtime/minidump.h" #include "runtime/tcmalloc_hook.h" +#include "runtime/thread_context.h" #include "service/backend_options.h" #include "service/backend_service.h" #include "service/brpc_service.h" @@ -290,6 +291,71 @@ int main(int argc, char** argv) { #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER) doris::MemInfo::refresh_current_mem(); #endif + // LOG(WARNING) << "free_thread_ctx 1111 " << doris::free_thread_ctx.size(); + // { + // // std::lock_guard l(doris::free_thread_ctx_lock); + // for (auto it = doris::free_thread_ctx.begin(); it != doris::free_thread_ctx.end();) { + // // if (pthread_kill((*it)->pthread_id(), 0) == ESRCH) { + // // if (!(*it)->pthread_id() || pthread_tryjoin_np((*it)->pthread_id(), NULL) != EBUSY) { + // // delete *it; + // // it = doris::free_thread_ctx.erase(it); + // // } else { + // // it++; + // // } + // delete *it; + // it = doris::free_thread_ctx.erase(it); + // } + // } + // LOG(WARNING) << "free_thread_ctx 2222 " << doris::free_thread_ctx.size(); + + // LOG(WARNING) << "free_thread_ctx 1111 " << doris::ExecEnv::GetInstance()->free_thread_ctx().size(); + // { + // // std::lock_guard l(doris::free_thread_ctx_lock); + // for (auto it = doris::ExecEnv::GetInstance()->free_thread_ctx().begin(); it != doris::ExecEnv::GetInstance()->free_thread_ctx().end();) { + // // if (pthread_kill((*it)->pthread_id(), 0) == ESRCH) { + // // if (!(*it)->pthread_id() || pthread_tryjoin_np((*it)->pthread_id(), NULL) != EBUSY) { + // // delete *it; + // // it = 
doris::free_thread_ctx.erase(it); + // // } else { + // // it++; + // // } + // delete *it; + // it = doris::ExecEnv::GetInstance()->free_thread_ctx().erase(it); + // } + // } + // LOG(WARNING) << "free_thread_ctx 2222 " << doris::ExecEnv::GetInstance()->free_thread_ctx().size(); + + // LOG(WARNING) << "free2_thread_ctx 1111 " << doris::free2_thread_ctx.size(); + // { + // // std::lock_guard l(doris::free_thread_ctx_lock); + // for (auto it = doris::free2_thread_ctx.begin(); it != doris::free2_thread_ctx.end();) { + // LOG(WARNING) << "free2_thread_ctx 33333 " << *it; + // LOG(WARNING) << "free2_thread_ctx 44444 " << (**it == nullptr); + // if (*it == nullptr) { + // // delete *it; + // it = doris::free2_thread_ctx.erase(it); + // } else { + // it++; + // } + // } + // } + // LOG(WARNING) << "free2_thread_ctx 2222 " << doris::free2_thread_ctx.size(); + + // LOG(WARNING) << "free3_thread_ctx 1111 " << doris::free3_thread_ctx.size(); + // { + // // std::lock_guard l(doris::free_thread_ctx_lock); + // for (auto it = doris::free3_thread_ctx.begin(); it != doris::free3_thread_ctx.end();) { + // LOG(WARNING) << "free3_thread_ctx 33333 " << (*it).first << " : " << (*it).second; + // if ((*it).first != 0) { + // // delete *it; + // it = doris::free3_thread_ctx.erase(it); + // } else { + // it++; + // } + // } + // } + // LOG(WARNING) << "free3_thread_ctx 2222 " << doris::free3_thread_ctx.size(); + doris::ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->logout_task_mem_tracker(); sleep(10); } diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index e19427ff92472c..253b130d1ee944 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -54,7 +54,7 @@ HttpService::HttpService(ExecEnv* env, int port, int num_threads) HttpService::~HttpService() {} Status HttpService::start() { - add_default_path_handlers(_web_page_handler.get(), _env->process_mem_tracker()); + add_default_path_handlers(_web_page_handler.get(), 
MemTracker::get_process_tracker()); // register load MiniLoadAction* miniload_action = _pool.add(new MiniLoadAction(_env)); diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc index fd51c40b699c99..6068e529863614 100644 --- a/be/test/runtime/test_env.cc +++ b/be/test/runtime/test_env.cc @@ -36,7 +36,6 @@ TestEnv::TestEnv() _exec_env = ExecEnv::GetInstance(); _exec_env->_thread_mgr = new ThreadResourceMgr(2); _exec_env->_buffer_reservation = new ReservationTracker(); - _exec_env->_process_mem_tracker = MemTracker::create_tracker(-1, "TestEnv"); _exec_env->_task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10); _exec_env->disk_io_mgr()->init(-1); diff --git a/build.sh b/build.sh index 2c883c8ec5b443..a54e866eaef948 100755 --- a/build.sh +++ b/build.sh @@ -217,7 +217,7 @@ fi echo "Build generated code" cd ${DORIS_HOME}/gensrc # DO NOT using parallel make(-j) for gensrc -python --version +/home/disk3/zxy/tools/Python-2.7.10/python --version make # Clean and build Backend diff --git a/env.sh b/env.sh index 439de9e0ff3e6a..c4fcb20d4a97bc 100755 --- a/env.sh +++ b/env.sh @@ -42,7 +42,7 @@ if [[ -z ${DORIS_THIRDPARTY} ]]; then fi # check python -export PYTHON=python +export PYTHON=/home/disk3/zxy/tools/Python-2.7.10/python if ! ${PYTHON} --version; then export PYTHON=python2.7 if ! ${PYTHON} --version; then