From f37758e5e0a11168ac7d96b5d947455a62aed4cf Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Fri, 30 Oct 2020 02:21:37 +0800 Subject: [PATCH 1/4] [Feature] Running Profile OLAP_SCAN_NODE node layering and enhance readability --- be/src/exec/olap_scan_node.cpp | 59 +++---- be/src/exec/olap_scan_node.h | 1 + be/src/exec/olap_scanner.cpp | 7 +- be/src/exec/scan_node.cpp | 30 ++-- be/src/exec/scan_node.h | 30 +--- be/src/olap/olap_common.h | 1 + .../rowset/segment_v2/segment_iterator.cpp | 2 +- .../en/administrator-guide/running-profile.md | 149 +++++++----------- .../administrator-guide/running-profile.md | 149 +++++++----------- 9 files changed, 171 insertions(+), 257 deletions(-) diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index 43acfa82cb9b6a..f36f22537f960a 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -88,43 +88,44 @@ Status OlapScanNode::init(const TPlanNode& tnode, RuntimeState* state) { } void OlapScanNode::_init_counter(RuntimeState* state) { - ADD_TIMER(_runtime_profile, "ShowHintsTime"); + ADD_TIMER(_scanner_profile, "ShowHintsTime_V1"); - _reader_init_timer = ADD_TIMER(_runtime_profile, "ReaderInitTime"); - _read_compressed_counter = ADD_COUNTER(_runtime_profile, "CompressedBytesRead", TUnit::BYTES); - _read_uncompressed_counter = ADD_COUNTER(_runtime_profile, "UncompressedBytesRead", TUnit::BYTES); - _block_load_timer = ADD_TIMER(_runtime_profile, "BlockLoadTime"); - _block_load_counter = ADD_COUNTER(_runtime_profile, "BlocksLoad", TUnit::UNIT); - _block_fetch_timer = ADD_TIMER(_runtime_profile, "BlockFetchTime"); - _raw_rows_counter = ADD_COUNTER(_runtime_profile, "RawRowsRead", TUnit::UNIT); - _block_convert_timer = ADD_TIMER(_runtime_profile, "BlockConvertTime"); - _block_seek_timer = ADD_TIMER(_runtime_profile, "BlockSeekTime"); - _block_seek_counter = ADD_COUNTER(_runtime_profile, "BlockSeekCount", TUnit::UNIT); + _reader_init_timer = ADD_TIMER(_scanner_profile, "ReaderInitTime"); 
+ _read_compressed_counter = ADD_COUNTER(_segment_profile, "CompressedBytesRead", TUnit::BYTES); + _read_uncompressed_counter = ADD_COUNTER(_segment_profile, "UncompressedBytesRead", TUnit::BYTES); + _block_load_timer = ADD_TIMER(_segment_profile, "BlockLoadTime"); + _block_load_counter = ADD_COUNTER(_segment_profile, "BlocksLoad", TUnit::UNIT); + _block_fetch_timer = ADD_TIMER(_scanner_profile, "BlockFetchTime"); + _raw_rows_counter = ADD_COUNTER(_segment_profile, "RawRowsRead", TUnit::UNIT); + _block_convert_timer = ADD_TIMER(_scanner_profile, "BlockConvertTime"); + _block_seek_timer = ADD_TIMER(_segment_profile, "BlockSeekTime"); + _block_seek_counter = ADD_COUNTER(_segment_profile, "BlockSeekCount", TUnit::UNIT); - _rows_vec_cond_counter = ADD_COUNTER(_runtime_profile, "RowsVectorPredFiltered", TUnit::UNIT); - _vec_cond_timer = ADD_TIMER(_runtime_profile, "VectorPredEvalTime"); + _rows_vec_cond_counter = ADD_COUNTER(_segment_profile, "RowsVectorPredFiltered", TUnit::UNIT); + _vec_cond_timer = ADD_TIMER(_segment_profile, "VectorPredEvalTime"); - _stats_filtered_counter = ADD_COUNTER(_runtime_profile, "RowsStatsFiltered", TUnit::UNIT); - _bf_filtered_counter = ADD_COUNTER(_runtime_profile, "RowsBloomFilterFiltered", TUnit::UNIT); - _del_filtered_counter = ADD_COUNTER(_runtime_profile, "RowsDelFiltered", TUnit::UNIT); - _key_range_filtered_counter = ADD_COUNTER(_runtime_profile, "RowsKeyRangeFiltered", TUnit::UNIT); + _stats_filtered_counter = ADD_COUNTER(_segment_profile, "RowsStatsFiltered", TUnit::UNIT); + _bf_filtered_counter = ADD_COUNTER(_segment_profile, "RowsBloomFilterFiltered", TUnit::UNIT); + _del_filtered_counter = ADD_COUNTER(_scanner_profile, "RowsDelFiltered", TUnit::UNIT); + _conditions_filtered_counter = ADD_COUNTER(_segment_profile, "RowsConditionsFiltered", TUnit::UNIT); + _key_range_filtered_counter = ADD_COUNTER(_segment_profile, "RowsKeyRangeFiltered", TUnit::UNIT); - _io_timer = ADD_TIMER(_runtime_profile, "IOTimer"); - _decompressor_timer = 
ADD_TIMER(_runtime_profile, "DecompressorTimer"); - _index_load_timer = ADD_TIMER(_runtime_profile, "IndexLoadTime"); + _io_timer = ADD_TIMER(_segment_profile, "IOTimer"); + _decompressor_timer = ADD_TIMER(_segment_profile, "DecompressorTimer"); + _index_load_timer = ADD_TIMER(_segment_profile, "IndexLoadTime_V1"); - _scan_timer = ADD_TIMER(_runtime_profile, "ScanTime"); + _scan_timer = ADD_TIMER(_scanner_profile, "ScanTime"); - _total_pages_num_counter = ADD_COUNTER(_runtime_profile, "TotalPagesNum", TUnit::UNIT); - _cached_pages_num_counter = ADD_COUNTER(_runtime_profile, "CachedPagesNum", TUnit::UNIT); + _total_pages_num_counter = ADD_COUNTER(_segment_profile, "TotalPagesNum", TUnit::UNIT); + _cached_pages_num_counter = ADD_COUNTER(_segment_profile, "CachedPagesNum", TUnit::UNIT); - _bitmap_index_filter_counter = ADD_COUNTER(_runtime_profile, "RowsBitmapIndexFiltered", TUnit::UNIT); - _bitmap_index_filter_timer = ADD_TIMER(_runtime_profile, "BitmapIndexFilterTimer"); + _bitmap_index_filter_counter = ADD_COUNTER(_segment_profile, "RowsBitmapIndexFiltered", TUnit::UNIT); + _bitmap_index_filter_timer = ADD_TIMER(_segment_profile, "BitmapIndexFilterTimer"); _num_scanners = ADD_COUNTER(_runtime_profile, "NumScanners", TUnit::UNIT); - _filtered_segment_counter = ADD_COUNTER(_runtime_profile, "NumSegmentFiltered", TUnit::UNIT); - _total_segment_counter = ADD_COUNTER(_runtime_profile, "NumSegmentTotal", TUnit::UNIT); + _filtered_segment_counter = ADD_COUNTER(_segment_profile, "NumSegmentFiltered", TUnit::UNIT); + _total_segment_counter = ADD_COUNTER(_segment_profile, "NumSegmentTotal", TUnit::UNIT); } Status OlapScanNode::prepare(RuntimeState* state) { @@ -134,7 +135,7 @@ Status OlapScanNode::prepare(RuntimeState* state) { _tablet_counter = ADD_COUNTER(runtime_profile(), "TabletCount ", TUnit::UNIT); _rows_pushed_cond_filtered_counter = - ADD_COUNTER(_runtime_profile, "RowsPushedCondFiltered", TUnit::UNIT); + ADD_COUNTER(_scanner_profile, "RowsPushedCondFiltered", 
TUnit::UNIT); _init_counter(state); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); if (_tuple_desc == NULL) { @@ -567,7 +568,7 @@ static Status get_hints( return Status::InternalError(ss.str()); } - RuntimeProfile::Counter* show_hints_timer = profile->get_counter("ShowHintsTime"); + RuntimeProfile::Counter* show_hints_timer = profile->get_counter("ShowHintsTime_V1"); std::vector> ranges; bool have_valid_range = false; for (auto& key_range : scan_key_range) { diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h index 959b3c5adde5eb..df4d529ad94cf4 100644 --- a/be/src/exec/olap_scan_node.h +++ b/be/src/exec/olap_scan_node.h @@ -267,6 +267,7 @@ class OlapScanNode : public ScanNode { RuntimeProfile::Counter* _stats_filtered_counter = nullptr; RuntimeProfile::Counter* _bf_filtered_counter = nullptr; RuntimeProfile::Counter* _del_filtered_counter = nullptr; + RuntimeProfile::Counter* _conditions_filtered_counter = nullptr; RuntimeProfile::Counter* _key_range_filtered_counter = nullptr; RuntimeProfile::Counter* _block_seek_timer = nullptr; diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index bdf2c2fa060cff..06f48c6514ee69 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -34,11 +34,6 @@ namespace doris { -static const std::string SCANNER_THREAD_TOTAL_WALLCLOCK_TIME = - "ScannerThreadsTotalWallClockTime"; -static const std::string MATERIALIZE_TUPLE_TIMER = - "MaterializeTupleTime(*)"; - OlapScanner::OlapScanner( RuntimeState* runtime_state, OlapScanNode* parent, @@ -481,6 +476,8 @@ void OlapScanner::update_counter() { COUNTER_UPDATE(_parent->_stats_filtered_counter, _reader->stats().rows_stats_filtered); COUNTER_UPDATE(_parent->_bf_filtered_counter, _reader->stats().rows_bf_filtered); COUNTER_UPDATE(_parent->_del_filtered_counter, _reader->stats().rows_del_filtered); + COUNTER_UPDATE(_parent->_conditions_filtered_counter, _reader->stats().rows_conditions_filtered); + 
COUNTER_UPDATE(_parent->_key_range_filtered_counter, _reader->stats().rows_key_range_filtered); COUNTER_UPDATE(_parent->_index_load_timer, _reader->stats().index_load_ns); diff --git a/be/src/exec/scan_node.cpp b/be/src/exec/scan_node.cpp index 6021abfe1dbbdf..d2d90fe47b6761 100644 --- a/be/src/exec/scan_node.cpp +++ b/be/src/exec/scan_node.cpp @@ -23,45 +23,37 @@ namespace doris { const string ScanNode::_s_bytes_read_counter = "BytesRead"; const string ScanNode::_s_rows_read_counter = "RowsRead"; -const string ScanNode::_s_total_read_timer = "TotalRawReadTime(*)"; -const string ScanNode::_s_total_throughput_counter = "TotalReadThroughput"; const string ScanNode::_s_materialize_tuple_timer = "MaterializeTupleTime(*)"; -const string ScanNode::_s_per_read_thread_throughput_counter = - "PerReadThreadRawHdfsThroughput"; const string ScanNode::_s_num_disks_accessed_counter = "NumDiskAccess"; const string ScanNode::_s_scanner_thread_counters_prefix = "ScannerThreads"; const string ScanNode::_s_scanner_thread_total_wallclock_time = "ScannerThreadsTotalWallClockTime"; -const string ScanNode::_s_num_scanner_threads_started ="NumScannerThreadsStarted"; - Status ScanNode::prepare(RuntimeState* state) { + init_scan_profile(); RETURN_IF_ERROR(ExecNode::prepare(state)); - _scanner_thread_counters = - ADD_THREAD_COUNTERS(runtime_profile(), _s_scanner_thread_counters_prefix); _bytes_read_counter = - ADD_COUNTER(runtime_profile(), _s_bytes_read_counter, TUnit::BYTES); + ADD_COUNTER(_segment_profile, _s_bytes_read_counter, TUnit::BYTES); //TODO: The _rows_read_counter == RowsReturned counter in exec node, there is no need to keep both of them _rows_read_counter = - ADD_COUNTER(runtime_profile(), _s_rows_read_counter, TUnit::UNIT); - _read_timer = ADD_TIMER(runtime_profile(), _s_total_read_timer); + ADD_COUNTER(_scanner_profile, _s_rows_read_counter, TUnit::UNIT); #ifndef BE_TEST - _total_throughput_counter = runtime_profile()->add_rate_counter( - _s_total_throughput_counter, 
_bytes_read_counter); #endif _materialize_tuple_timer = ADD_CHILD_TIMER(runtime_profile(), _s_materialize_tuple_timer, _s_scanner_thread_total_wallclock_time); - _per_read_thread_throughput_counter = runtime_profile()->add_derived_counter( - _s_per_read_thread_throughput_counter, TUnit::BYTES_PER_SECOND, - boost::bind(&RuntimeProfile::units_per_second, - _bytes_read_counter, - _read_timer), - ""); _num_disks_accessed_counter = ADD_COUNTER(runtime_profile(), _s_num_disks_accessed_counter, TUnit::UNIT); return Status::OK(); } +void ScanNode::init_scan_profile() { + _scanner_profile.reset(new RuntimeProfile("OlapScanner")); + runtime_profile()->add_child(_scanner_profile.get(), true, NULL); + + _segment_profile.reset(new RuntimeProfile("SegmentIterator")); + _scanner_profile->add_child(_segment_profile.get(), true, NULL); +} + } diff --git a/be/src/exec/scan_node.h b/be/src/exec/scan_node.h index 54ef5dc70c26ec..98a754b3b382e8 100644 --- a/be/src/exec/scan_node.h +++ b/be/src/exec/scan_node.h @@ -96,49 +96,29 @@ class ScanNode : public ExecNode { RuntimeProfile::Counter* rows_read_counter() const { return _rows_read_counter; } - RuntimeProfile::Counter* read_timer() const { - return _read_timer; - } - RuntimeProfile::Counter* total_throughput_counter() const { - return _total_throughput_counter; - } - RuntimeProfile::Counter* per_read_thread_throughput_counter() const { - return _per_read_thread_throughput_counter; - } RuntimeProfile::Counter* materialize_tuple_timer() const { return _materialize_tuple_timer; } - RuntimeProfile::ThreadCounters* scanner_thread_counters() const { - return _scanner_thread_counters; - } + // OLAP_SCAN_NODE profile layering: OLAP_SCAN_NODE, OlapScanner, and SegmentIterator according to the calling relationship + void init_scan_profile(); // names of ScanNode common counters static const std::string _s_bytes_read_counter; static const std::string _s_rows_read_counter; - static const std::string _s_total_read_timer; - static const 
std::string _s_total_throughput_counter; - static const std::string _s_per_read_thread_throughput_counter; static const std::string _s_num_disks_accessed_counter; static const std::string _s_materialize_tuple_timer; static const std::string _s_scanner_thread_counters_prefix; static const std::string _s_scanner_thread_total_wallclock_time; - static const std::string _s_average_io_mgr_queue_capacity; - static const std::string _s_num_scanner_threads_started; protected: RuntimeProfile::Counter* _bytes_read_counter; // # bytes read from the scanner // # rows/tuples read from the scanner (including those discarded by eval_conjucts()) RuntimeProfile::Counter* _rows_read_counter; - RuntimeProfile::Counter* _read_timer; // total read time - // Wall based aggregate read throughput [bytes/sec] - RuntimeProfile::Counter* _total_throughput_counter; - // Per thread read throughput [bytes/sec] - RuntimeProfile::Counter* _per_read_thread_throughput_counter; RuntimeProfile::Counter* _num_disks_accessed_counter; RuntimeProfile::Counter* _materialize_tuple_timer; // time writing tuple slots - // Aggregated scanner thread counters - RuntimeProfile::ThreadCounters* _scanner_thread_counters; - RuntimeProfile::Counter* _num_scanner_threads_started_counter; + + boost::scoped_ptr _scanner_profile; + boost::scoped_ptr _segment_profile; }; } diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index 889f0e3eb23a3f..48d172baadf70b 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -260,6 +260,7 @@ struct OlapReaderStatistics { int64_t rows_stats_filtered = 0; int64_t rows_bf_filtered = 0; int64_t rows_del_filtered = 0; + int64_t rows_conditions_filtered = 0; int64_t index_load_ns = 0; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 90d8e8e51c4574..a1bf554ce53e81 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
@@ -220,7 +220,7 @@ Status SegmentIterator::_get_row_ranges_by_column_conditions() { RETURN_IF_ERROR(_get_row_ranges_from_conditions(&condition_row_ranges)); size_t pre_size = _row_bitmap.cardinality(); _row_bitmap &= RowRanges::ranges_to_roaring(condition_row_ranges); - _opts.stats->rows_del_filtered += (pre_size - _row_bitmap.cardinality()); + _opts.stats->rows_conditions_filtered += (pre_size - _row_bitmap.cardinality()); } // TODO(hkp): calculate filter rate to decide whether to diff --git a/docs/en/administrator-guide/running-profile.md b/docs/en/administrator-guide/running-profile.md index 60aca26b56b368..7022117567cf1d 100644 --- a/docs/en/administrator-guide/running-profile.md +++ b/docs/en/administrator-guide/running-profile.md @@ -148,103 +148,74 @@ The `OLAP_SCAN_NODE` is responsible for specific data scanning tasks. One `OLAP_ Some or all of the predicate conditions in the query will be pushed to `OLAP_SCAN_NODE`. Some of these predicate conditions will continue to be pushed down to the storage engine in order to use the storage engine's index for data filtering. The other part will be kept in `OLAP_SCAN_NODE` to filter the data returned from the storage engine. +The profile of the `OLAP_SCAN_NODE` node is usually used to analyze the efficiency of data scanning. It is divided into three layers: `OLAP_SCAN_NODE`, `OlapScanner`, and `SegmentIterator` according to the calling relationship. + The profile of a typical `OLAP_SCAN_NODE` is as follows. Some indicators will have different meanings depending on the storage format (V1 or V2). ``` -OLAP_SCAN_NODE (id=0): (Active: 4.050ms, non-child: 35.68%) - -BitmapIndexFilterTimer: 0.000ns # Time consuming to filter data using bitmap index. - -BlockConvertTime: 7.433ms # Time consuming to convert a vectorized block into a row structure RowBlock. Vectorized Block is VectorizedRowBatch in V1, and RowBlockV2 in V2. - -BlockFetchTime: 36.934ms # Rowset Reader time to get Block. 
- -BlockLoadTime: 23.368ms # time of SegmentReader(V1) or SegmentIterator(V2) to get the block time. - -BlockSeekCount: 0 # The number of block seek times when reading segments. - -BlockSeekTime: 3.062ms # Time consuming for block seek when reading segments. - -BlocksLoad: 221 # number of blocks read - -BytesRead: 6.59 MB # The amount of data read from the data file. Assuming that 10 32-bit integers are read, the amount of data is 10 * 4B = 40 Bytes. This data only represents the fully expanded size of the data in memory, and does not represent the actual IO size. - -CachedPagesNum: 0 # In V2 only, when PageCache is enabled, the number of pages that hit Cache. - -CompressedBytesRead: 1.36 MB # V1, the size of the data read from the file before decompression. In V2, the uncompressed size of Pages that did not hit PageCache. - -DecompressorTimer: 4.194ms # Data decompression takes time. - -IOTimer: 1.404ms # IO time to actually read data from the operating system. - -IndexLoadTime: 1.521ms # In V1 only, it takes time to read Index Stream. - -NumDiskAccess: 6 # The number of disks involved in this ScanNode. - -NumScanners: 25 # The number of Scanners generated by this ScanNode. - -NumSegmentFiltered: 4 # Number of Segment filtered by column statistic when creating Segment Iterator. - -NumSegmentTotal: 20 # Total number of Segment related to this scan. - -PeakMemoryUsage: 0 # meaningless - -PerReadThreadRawHdfsThroughput: 0.00 /sec # meaningless - -RawRowsRead: 141.71K # The number of raw rows read in the storage engine. See below for details. - -ReaderInitTime: 16.515ms # OlapScanner time to initialize Reader. V1 includes the time to form MergeHeap. V2 includes the time to generate Iterators at all levels and read the first block. - -RowsBitmapFiltered: 0 # Number of rows filtered by bitmap index - -RowsBloomFilterFiltered: 0 # In V2 only, the number of rows filtered by the BloomFilter index. 
- -RowsDelFiltered: 0 # V1 indicates the number of rows filtered according to the delete condition. V2 also includes the number of rows filtered by BloomFilter and some predicate conditions. - -RowsPushedCondFiltered: 0 # Filter the conditions based on the predicate passed down, such as the condition passed from BuildTable to ProbeTable in Join calculation. This value is inaccurate because if the filtering effect is poor, it will not be filtered. - -RowsRead: 132.78K # The number of rows returned from the storage engine to the Scanner, excluding the number of rows filtered by the Scanner. - -RowsReturned: 132.78K # The number of rows returned from ScanNode to the upper node. - -RowsReturnedRate: 32.78 M/sec # RowsReturned/ActiveTime - -RowsStatsFiltered: 0 # In V2, the number of rows filtered according to Zonemap with predicate conditions. V1 also contains the number of rows filtered by BloomFilter. - -RowsVectorPredFiltered: 0 # The number of rows filtered by the vectorized conditional filtering operation. - -ScanTime: 49.239ms # Time-consuming statistics of Scanner calling get_next() method. - -ScannerThreadsInvoluntaryContextSwitches: 0 # meaningless - -ScannerThreadsTotalWallClockTime: 0.000ns # meaningless - -MaterializeTupleTime(*): 0.000ns # meaningless - -ScannerThreadsSysTime: 0.000ns # meaningless - -ScannerThreadsUserTime: 0.000ns # meaningless - -ScannerThreadsVoluntaryContextSwitches: 0 # meaningless - -ShowHintsTime: 0.000ns # meaningless in V2. Part of the data is read in V1 to perform ScanRange segmentation. - -TabletCount: 25 # The number of tablets involved in this ScanNode. - -TotalPagesNum: 0 # In V2 only, the total number of pages read. - -TotalRawReadTime(*): 0.000ns # meaningless - -TotalReadThroughput: 0.00 /sec # meaningless - -UncompressedBytesRead: 4.28 MB # V1 is the decompressed size of the read data file (if the file does not need to be decompressed, the file size is directly counted). 
In V2, only the uncompressed size of the PageCache is counted (if the Page does not need to be decompressed, the Page size is directly counted) - -VectorPredEvalTime: 0.000ns # Time consuming of vectorized conditional filtering operation. +OLAP_SCAN_NODE (id=0):(Active: 1.2ms,% non-child: 0.00%) + - NumDiskAccess: 1 # The number of disks involved in this ScanNode node. + - NumScanners: 20 # The number of Scanners generated by this ScanNode. + - PeakMemoryUsage: 0.00 # Peak memory usage during query, not used yet + - RowsReturned: 7 # The number of rows returned from ScanNode to the upper node. + - RowsReturnedRate: 6.979K /sec # RowsReturned/ActiveTime + - TabletCount: 20 # The number of Tablets involved in this ScanNode. + OlapScanner: + - BlockConvertTime: 8.941us # The time it takes to convert a vectorized Block into a RowBlock with a row structure. The vectorized Block is VectorizedRowBatch in V1 and RowBlockV2 in V2. + - BlockFetchTime: 468.974us # Rowset Reader gets the time of the Block. + - ReaderInitTime: 5.475ms # The time when OlapScanner initializes Reader. V1 includes the time to form MergeHeap. V2 includes the time to generate various Iterators and read the first group of blocks. + - RowsDelFiltered: 0 # Including the number of rows filtered out according to the Delete information in the Tablet, and the number of rows filtered for marked deleted rows under the unique key model. + - RowsPushedCondFiltered: 0 # Filter conditions based on the predicates passed down, such as the conditions passed from BuildTable to ProbeTable in Join calculation. This value is not accurate, because if the filtering effect is poor, it will no longer be filtered. + - RowsRead: 7 # The number of rows returned from the storage engine to the Scanner, excluding the number of rows filtered by the Scanner. + - ScanTime: 39.24us # The number of rows returned from ScanNode to the upper node. + - ShowHintsTime_V1: 0ns # V2 has no meaning. 
Read part of the data in V1 to perform ScanRange segmentation. + SegmentIterator: + - BitmapIndexFilterTimer: 779ns # Use bitmap index to filter data time-consuming. + - BlockLoadTime: 415.925us # SegmentReader(V1) or SegmentIterator(V2) gets the time of the block. + - BlockSeekCount: 12 # The number of block seeks when reading Segment. + - BlockSeekTime: 222.556us # It takes time to block seek when reading Segment. + - BlocksLoad: 6 # read the number of blocks + - BytesRead: 265.00 B # The amount of data read from the data file. Assuming that 10 32-bit integers are read, the amount of data is 10 * 4B = 40 Bytes. This data only represents the fully expanded size of the data in memory, and does not represent the actual IO size. + - CachedPagesNum: 30 # In V2 only, when PageCache is enabled, the number of Pages that hit the Cache. + - CompressedBytesRead: 0.00 # In V1, the size of the data read from the file before decompression. In V2, the pre-compressed size of the read page that did not hit the PageCache. + - DecompressorTimer: 0ns # Data decompression takes time. + - IOTimer: 0ns # IO time for actually reading data from the operating system. + - IndexLoadTime_V1: 0ns # Only in V1, it takes time to read Index Stream. + - NumSegmentFiltered: 0 # When generating Segment Iterator, the number of Segments that are completely filtered out through column statistics and query conditions. + - NumSegmentTotal: 6 # Query the number of all segments involved. + - RawRowsRead: 7 # The number of raw rows read in the storage engine. See below for details. + - RowsBitmapIndexFiltered: 0 # Only in V2, the number of rows filtered by the Bitmap index. + - RowsBloomFilterFiltered: 0 # Only in V2, the number of rows filtered by BloomFilter index. + - RowsKeyRangeFiltered: 0 # In V2 only, the number of rows filtered out by SortkeyIndex index. + - RowsStatsFiltered: 0 # In V2, the number of rows filtered by the ZoneMap index, including the deletion condition. 
V1 also contains the number of rows filtered by BloomFilter. + - RowsConditionsFiltered: 0 # Only in V2, the number of rows filtered by various column indexes. + - RowsVectorPredFiltered: 0 # The number of rows filtered by the vectorized condition filtering operation. + - TotalPagesNum: 30 # Only in V2, the total number of pages read. + - UncompressedBytesRead: 0.00 # V1 is the decompressed size of the read data file (if the file does not need to be decompressed, the file size is directly counted). In V2, only the decompressed size of the Page that missed PageCache is counted (if the Page does not need to be decompressed, the Page size is directly counted) + - VectorPredEvalTime: 0ns # Time-consuming of vectorized condition filtering operation. ``` -* Some notes on the number of rows in Profile - - The metrics related to the number of rows in the Profile are: - - * NumSegmentFiltered - * NumSegmentTotal - - The number of segments actually read can be obtained through these two metrics. - - * RowsKeyRangeFiltered - * RowsBitmapIndexFiltered - * RowsBloomFilterFiltered - * RowsStatsFiltered - * RowsDelFiltered - * RawRowsRead - * RowsRead - * RowsReturned - - The predicate conditions in a query are filtered in the storage engine and Scanner respectively. Among the above indicators, the group of metrics `Rows***Filtered` describes the number of rows filtered in the storage engine. The last three metrics describe the number of lines processed in Scanner. - - The following only describes the process of reading data in Segment V2 format. In the Segment V1 format, the meaning of these metrics are slightly different. - - When reading a V2 format segment, it will first filter based on the Key range (the query range composed of the prefix key), and the number of filtered lines is recorded in `RowsKeyRangeFiltered`. After that, the data is filtered using the Bitmap index, and the filtered rows are recorded in `RowsBitmapIndexFiltered`. 
After that, the data is filtered using the BloomFilter index and recorded in `RowsBloomFilterFiltered`. The value of `RowsBloomFilterFiltered` is the difference between the total number of rows in the Segment (not the number of rows after being filtered by the Bitmap index) and the number of remaining rows after BloomFilter filtering, so the data filtered by BloomFilter may overlap with the data filtered by Bitmap. - - `RowsStatsFiltered` records the number of rows filtered by other predicate conditions. This includes the predicate conditions pushed down to the storage engine and the Delete condition in the storage engine. - - `RowsDelFiltered` contains the number of filtered rows recorded by `RowsBloomFilterFiltered` and `RowsStatsFiltered`. - - `RawRowsRead` is the number of rows that need to be read after the above filtering. The `RowsRead` is the number of rows returned to the Scanner. `RowsRead` is usually smaller than `RawRowsRead`, because returning from the storage engine to the Scanner may go through a data aggregation. - - `RowsReturned` is the number of rows that ScanNode will eventually return to the upper node. `RowsReturned` will usually be less than -`RowsRead`. Because there will be some predicate conditions that are not pushed down to the storage engine on the Scanner, it will be filtered in Scanner. - - Through the above indicators, you can roughly analyze the number of rows processed by the storage engine and the final number of rows after filtering. Through the set of indicators of `Rows***Filtered`, you can also analyze whether the query condition is pushed down to the storage engine and the filtering effect of different indexes. - - If the gap between `RawRowsRead` and `RowsRead` is large, it means that a large number of rows are aggregated, and the aggregation may be time-consuming. If the gap between `RowsRead` and `RowsReturned` is large, it means that many lines are filtered in Scanner. 
This shows that many highly selected conditions are not pushed to the storage engine. The filtering efficiency in Scanner is worse than that in the storage engine. +The predicate push down and index usage can be inferred from the related indicators of the number of data rows in the profile. The following only describes the profile in the reading process of segment V2 format data. In segment V1 format, the meaning of these indicators is slightly different. -* Simple analysis of Scan Node Profile + - When reading a segment V2, if the query has key_ranges (the query range composed of prefix keys), first filter the data through the SortkeyIndex index, and the number of filtered rows is recorded in `RowsKeyRangeFiltered`. + - After that, use the Bitmap index to perform precise filtering on the columns containing the bitmap index in the query condition, and the number of filtered rows is recorded in `RowsBitmapIndexFiltered`. + - After that, according to the equivalent (eq, in, is) condition in the query condition, use the BloomFilter index to filter the data and record it in `RowsBloomFilterFiltered`. The value of `RowsBloomFilterFiltered` is the difference between the total number of rows of the Segment (not the number of rows filtered by the Bitmap index) and the number of remaining rows after BloomFilter, so the data filtered by BloomFilter may overlap with the data filtered by Bitmap. + - After that, use the ZoneMap index to filter the data according to the query conditions and delete conditions and record it in `RowsStatsFiltered`. + - `RowsConditionsFiltered` is the number of rows filtered by various indexes, including the values ​​of `RowsBloomFilterFiltered` and `RowsStatsFiltered`. + - So far, the Init phase is completed, and the number of rows filtered by the condition to be deleted in the Next phase is recorded in `RowsDelFiltered`. 
Therefore, the number of rows actually filtered by the delete condition are recorded in `RowsStatsFiltered` and `RowsDelFiltered` respectively. + - `RawRowsRead` is the final number of rows to be read after the above filtering. + - `RowsRead` is the number of rows finally returned to Scanner. `RowsRead` is usually smaller than `RawRowsRead`, because returning from the storage engine to the Scanner may go through a data aggregation. If the difference between `RawRowsRead` and `RowsRead` is large, it means that a large number of rows are aggregated, and aggregation may be time-consuming. + - `RowsReturned` is the number of rows finally returned by ScanNode to the upper node. `RowsReturned` is usually smaller than `RowsRead`. Because there will be some predicate conditions on the Scanner that are not pushed down to the storage engine, filtering will be performed once. If the difference between `RowsRead` and `RowsReturned` is large, it means that many rows are filtered in the Scanner. This shows that many highly selective predicate conditions are not pushed to the storage engine. The filtering efficiency in Scanner is worse than that in storage engine. - OlapScanNode's Profile is usually used to analyze the efficiency of data scanning. In addition to the information about the number of rows that can be used to infer the predicate pushdown and index usage, the following aspects can also be used for simple analysis. +Through the above indicators, you can roughly analyze the number of rows processed by the storage engine and the size of the final filtered result row. Through the `Rows***Filtered` group of indicators, it is also possible to analyze whether the query conditions are pushed down to the storage engine, and the filtering effects of different indexes. In addition, a simple analysis can be made through the following aspects. - * First of all, many indicators, such as `IOTimer`, `BlockFetchTime`, etc. 
are the accumulation of all Scanner thread indicators, so the value may be relatively large. And because the Scanner thread reads data asynchronously, these cumulative indicators can only reflect the cumulative working time of the Scanner, and do not directly represent the time cost of ScanNode. The proportion of time spent by ScanNode in the entire query plan is the value recorded in the `Active` field. Sometimes it appears that `IOTimer` has tens of seconds, while `Active` actually has only a few seconds. This situation is usually because: 1. `IOTimer` is the accumulated time of multiple Scanners, and there are many Scanners. 2. The upper nodes are more time-consuming. For example, the upper node takes 100 seconds, while the lower ScanNode only takes 10 seconds. The field reflected in `Active` may only be a few milliseconds. Because while the upper node is processing data, the ScanNode has asynchronously scanned the data and prepared the data. When the upper-layer node obtains data from ScanNode, it can obtain the prepared data, so the `Active` time is very short. - * IOTimer is the IO time, which can directly reflect the time-consuming IO operation. Here is the accumulated IO time of all Scanner threads. - * NumScanners indicates the number of Scanner threads. Too many or too few threads will affect query efficiency. At the same time, some aggregate indicators can be divided by the number of threads to roughly estimate the time spent by each thread. - * TabletCount represents the number of tablets that need to be scanned. Excessive numbers may mean that a large number of random reads and data merge operations are required. - * UncompressedBytesRead indirectly reflects the amount of data read. If the value is large, it indicates that there may be a large number of IO operations. - * CachedPagesNum and TotalPagesNum. For V2 format, you can view the hit of PageCache. The higher the hit rate, the less time the IO and decompression operations take. 
+ - Many indicators under `OlapScanner`, such as `IOTimer`, `BlockFetchTime`, etc., are the accumulation of all Scanner thread indicators, so the value may be relatively large. And because the Scanner thread reads data asynchronously, these cumulative indicators can only reflect the cumulative working time of the Scanner, and do not directly represent the time consumption of the ScanNode. The time-consuming ratio of ScanNode in the entire query plan is the value recorded in the `Active` field. Sometimes it appears that `IOTimer` has tens of seconds, but `Active` is actually only a few seconds. This situation is usually due to: + - `IOTimer` is the accumulated time of multiple Scanners, and there are more Scanners. + - The upper node is time-consuming. For example, the upper node takes 100 seconds, while the lower ScanNode only takes 10 seconds. The field reflected in `Active` may be only a few milliseconds. Because while the upper layer is processing data, ScanNode has performed data scanning asynchronously and prepared the data. When the upper node obtains data from ScanNode, it can obtain the prepared data, so the Active time is very short. + - `NumScanners` indicates the number of Scanner threads. Too many or too few threads will affect query efficiency. At the same time, some summary indicators can be divided by the number of threads to roughly estimate the time consumption of each thread. + - `TabletCount` indicates the number of tablets to be scanned. Too many may mean a lot of random read and data merge operations. + - `UncompressedBytesRead` indirectly reflects the amount of data read. If the value is large, it means that there may be a lot of IO operations. + - `CachedPagesNum` and `TotalPagesNum` can check the hitting status of PageCache. The higher the hit rate, the less time-consuming IO and decompression operations. 
#### `Buffer pool` - AllocTime: Memory allocation time diff --git a/docs/zh-CN/administrator-guide/running-profile.md b/docs/zh-CN/administrator-guide/running-profile.md index cc1fb4855b7f57..ca2702f3f461ab 100644 --- a/docs/zh-CN/administrator-guide/running-profile.md +++ b/docs/zh-CN/administrator-guide/running-profile.md @@ -147,103 +147,74 @@ BE端收集的统计信息较多,下面列出了各个参数的对应含义: 查询中的部分或全部谓词条件会推送给 `OLAP_SCAN_NODE`。这些谓词条件中一部分会继续下推给存储引擎,以便利用存储引擎的索引进行数据过滤。另一部分会保留在 `OLAP_SCAN_NODE` 中,用于过滤从存储引擎中返回的数据。 +`OLAP_SCAN_NODE` 节点的 Profile 通常用于分析数据扫描的效率,依据调用关系分为 `OLAP_SCAN_NODE`、`OlapScanner`、`SegmentIterator` 三层。 + 一个典型的 `OLAP_SCAN_NODE` 节点的 Profile 如下。部分指标会因存储格式的不同(V1 或 V2)而有不同含义。 ``` -OLAP_SCAN_NODE (id=0):(Active: 4.050ms, non-child: 35.68%) - - BitmapIndexFilterTimer: 0.000ns # 利用 bitmap 索引过滤数据的耗时。 - - BlockConvertTime: 7.433ms # 将向量化Block转换为行结构的 RowBlock 的耗时。向量化 Block 在 V1 中为 VectorizedRowBatch,V2中为 RowBlockV2。 - - BlockFetchTime: 36.934ms # Rowset Reader 获取 Block 的时间。 - - BlockLoadTime: 23.368ms # SegmentReader(V1) 或 SegmentIterator(V2) 获取 block 的时间。 - - BlockSeekCount: 0 # 读取 Segment 时进行 block seek 的次数。 - - BlockSeekTime: 3.062ms # 读取 Segment 时进行 block seek 的耗时。 - - BlocksLoad: 221 # 读取 Block 的数量 - - BytesRead: 6.59 MB # 从数据文件中读取到的数据量。假设读取到了是10个32位整型,则数据量为 10 * 4B = 40 Bytes。这个数据仅表示数据在内存中全展开的大小,并不代表实际的 IO 大小。 - - CachedPagesNum: 0 # 仅 V2 中,当开启 PageCache 后,命中 Cache 的 Page 数量。 - - CompressedBytesRead: 1.36 MB # V1 中,从文件中读取的解压前的数据大小。V2 中,读取到的没有命中 PageCache 的 Page 的压缩前的大小。 - - DecompressorTimer: 4.194ms # 数据解压耗时。 - - IOTimer: 1.404ms # 实际从操作系统读取数据的 IO 时间。 - - IndexLoadTime: 1.521ms # 仅 V1 中,读取 Index Stream 的耗时。 - - NumDiskAccess: 6 # 该 ScanNode 节点涉及到的磁盘数量。 - - NumScanners: 25 # 该 ScanNode 生成的 Scanner 数量。 - - NumSegmentFiltered: 4 # 在生成 Segment Iterator 时,通过列统计信息和查询条件,完全过滤掉的 Segment 数量。 - - NumSegmentTotal: 20 # 查询涉及的所有 Segment 数量。 - - PeakMemoryUsage: 0 # 无意义 - - PerReadThreadRawHdfsThroughput: 0.00 /sec # 无意义 - - RawRowsRead: 141.71K # 存储引擎中读取的原始行数。详情见下文。 - - ReaderInitTime: 
16.515ms # OlapScanner 初始化 Reader 的时间。V1 中包括组建 MergeHeap 的时间。V2 中包括生成各级 Iterator 并读取第一组Block的时间。 - - RowsBitmapFiltered: 0 # 利用 bitmap 索引过滤掉的行数。 - - RowsBloomFilterFiltered: 0 # 仅 V2 中,通过 BloomFilter 索引过滤掉的行数。 - - RowsDelFiltered: 0 # V1 中表示根据 delete 条件过滤掉的行数。V2 中还包括通过 BloomFilter 和部分谓词条件过滤掉的行数。 - - RowsPushedCondFiltered: 0 # 根据传递下推的谓词过滤掉的条件,比如 Join 计算中从 BuildTable 传递给 ProbeTable 的条件。该数值不准确,因为如果过滤效果差,就不再过滤了。 - - RowsRead: 132.78K # 从存储引擎返回到 Scanner 的行数,不包括经 Scanner 过滤的行数。 - - RowsReturned: 132.78K # 从 ScanNode 返回给上层节点的行数。 - - RowsReturnedRate: 32.78 M/sec # RowsReturned/ActiveTime - - RowsStatsFiltered: 0 # V2 中,包含谓词条件根据 Zonemap 过滤掉的行数。V1 中还包含通过 BloomFilter 过滤掉的行数。 - - RowsVectorPredFiltered: 0 # 通过向量化条件过滤操作过滤掉的行数。 - - ScanTime: 49.239ms:Scanner 调用 get_next() 方法的耗时统计。 - - ScannerThreadsInvoluntaryContextSwitches: 0 # 无意义 - - ScannerThreadsTotalWallClockTime: 0.000ns # 无意义 - - MaterializeTupleTime(*): 0.000ns # 无意义 - - ScannerThreadsSysTime: 0.000ns # 无意义 - - ScannerThreadsUserTime: 0.000ns # 无意义 - - ScannerThreadsVoluntaryContextSwitches: 0 # 无意义 - - ShowHintsTime: 0.000ns # V2 中无意义。V1 中读取部分数据来进行 ScanRange 的切分。 - - TabletCount : 25 # 该 ScanNode 涉及的 Tablet 数量。 - - TotalPagesNum: 0 # 仅 V2 中,读取的总 Page 数量。 - - TotalRawReadTime(*): 0.000ns # 无意义 - - TotalReadThroughput: 0.00 /sec # 无意义 - - UncompressedBytesRead: 4.28 MB # V1 中为读取的数据文件解压后的大小(如果文件无需解压,则直接统计文件大小)。V2 中,仅统计未命中 PageCache 的 Page 解压后的大小(如果Page无需解压,直接统计Page大小) - - VectorPredEvalTime: 0.000ns # 向量化条件过滤操作的耗时。 +OLAP_SCAN_NODE (id=0):(Active: 1.2ms, % non-child: 0.00%) + - NumDiskAccess: 1 # 该 ScanNode 节点涉及到的磁盘数量。 + - NumScanners: 20 # 该 ScanNode 生成的 Scanner 数量。 + - PeakMemoryUsage: 0.00 # 查询时内存使用的峰值,暂未使用 + - RowsReturned: 7 # 从 ScanNode 返回给上层节点的行数。 + - RowsReturnedRate: 6.979K /sec # RowsReturned/ActiveTime + - TabletCount : 20 # 该 ScanNode 涉及的 Tablet 数量。 + OlapScanner: + - BlockConvertTime: 8.941us # 将向量化Block转换为行结构的 RowBlock 的耗时。向量化 Block 在 V1 中为 VectorizedRowBatch,V2中为 RowBlockV2。 + - BlockFetchTime: 468.974us 
# Rowset Reader 获取 Block 的时间。 + - ReaderInitTime: 5.475ms # OlapScanner 初始化 Reader 的时间。V1 中包括组建 MergeHeap 的时间。V2 中包括生成各级 Iterator 并读取第一组Block的时间。 + - RowsDelFiltered: 0 # 包括根据 Tablet 中存在的 Delete 信息过滤掉的行数,以及 unique key 模型下对被标记的删除行过滤的行数。 + - RowsPushedCondFiltered: 0 # 根据传递下推的谓词过滤掉的条件,比如 Join 计算中从 BuildTable 传递给 ProbeTable 的条件。该数值不准确,因为如果过滤效果差,就不再过滤了。 + - RowsRead: 7 # 从存储引擎返回到 Scanner 的行数,不包括经 Scanner 过滤的行数。 + - ScanTime: 39.24us # Scanner 调用 get_next() 方法的耗时统计。 + - ShowHintsTime_V1: 0ns # V2 中无意义。V1 中读取部分数据来进行 ScanRange 的切分。 + SegmentIterator: + - BitmapIndexFilterTimer: 779ns # 利用 bitmap 索引过滤数据的耗时。 + - BlockLoadTime: 415.925us # SegmentReader(V1) 或 SegmentIterator(V2) 获取 block 的时间。 + - BlockSeekCount: 12 # 读取 Segment 时进行 block seek 的次数。 + - BlockSeekTime: 222.556us # 读取 Segment 时进行 block seek 的耗时。 + - BlocksLoad: 6 # 读取 Block 的数量 + - BytesRead: 265.00 B # 从数据文件中读取到的数据量。假设读取到了是10个32位整型,则数据量为 10 * 4B = 40 Bytes。这个数据仅表示数据在内存中全展开的大小,并不代表实际的 IO 大小。 + - CachedPagesNum: 30 # 仅 V2 中,当开启 PageCache 后,命中 Cache 的 Page 数量。 + - CompressedBytesRead: 0.00 # V1 中,从文件中读取的解压前的数据大小。V2 中,读取到的没有命中 PageCache 的 Page 的压缩前的大小。 + - DecompressorTimer: 0ns # 数据解压耗时。 + - IOTimer: 0ns # 实际从操作系统读取数据的 IO 时间。 + - IndexLoadTime_V1: 0ns # 仅 V1 中,读取 Index Stream 的耗时。 + - NumSegmentFiltered: 0 # 在生成 Segment Iterator 时,通过列统计信息和查询条件,完全过滤掉的 Segment 数量。 + - NumSegmentTotal: 6 # 查询涉及的所有 Segment 数量。 + - RawRowsRead: 7 # 存储引擎中读取的原始行数。详情见下文。 + - RowsBitmapIndexFiltered: 0 # 仅 V2 中,通过 Bitmap 索引过滤掉的行数。 + - RowsBloomFilterFiltered: 0 # 仅 V2 中,通过 BloomFilter 索引过滤掉的行数。 + - RowsKeyRangeFiltered: 0 # 仅 V2 中,通过 SortkeyIndex 索引过滤掉的行数。 + - RowsStatsFiltered: 0 # V2 中,通过 ZoneMap 索引过滤掉的行数,包含删除条件。V1 中还包含通过 BloomFilter 过滤掉的行数。 + - RowsConditionsFiltered: 0 # 仅 V2 中,通过各种列索引过滤掉的行数。 + - RowsVectorPredFiltered: 0 # 通过向量化条件过滤操作过滤掉的行数。 + - TotalPagesNum: 30 # 仅 V2 中,读取的总 Page 数量。 + - UncompressedBytesRead: 0.00 # V1 中为读取的数据文件解压后的大小(如果文件无需解压,则直接统计文件大小)。V2 中,仅统计未命中 PageCache 的 Page 解压后的大小(如果Page无需解压,直接统计Page大小) + - VectorPredEvalTime: 0ns
# 向量化条件过滤操作的耗时。 ``` -* Profile 中关于行数的一些说明 - - 在 Profile 中和行数相关的指标有: - - * NumSegmentFiltered - * NumSegmentTotal - - 通过这两个指标可以得到实际读取的 Segment 数量。 - - * RowsKeyRangeFiltered - * RowsBitmapIndexFiltered - * RowsBloomFilterFiltered - * RowsStatsFiltered - * RowsDelFiltered - * RawRowsRead - * RowsRead - * RowsReturned - - 一个查询中的谓词条件会分别在存储引擎和 Scanner 中进行过滤。以上指标中,`Rows***Filtered` 这组指标描述了在存储引擎中被过滤的行数。后三个指标描述了在 Scanner 中被处理的行数。 - - 以下仅针对 Segment V2 格式数据读取的流程进行说明。Segment V1 格式中,这些指标的含义略有不同。 +通过 Profile 中数据行数相关指标可以推断谓词条件下推和索引使用情况。以下仅针对 Segment V2 格式数据读取流程中的 Profile 进行说明。Segment V1 格式中,这些指标的含义略有不同。 - 当读取一个 V2 格式的 Segment 时,首先会根据 Key range(前缀key组成的查询范围)进行一次过滤,过滤掉的行数记录在 `RowsKeyRangeFiltered` 中。之后,再利用 Bitmap 索引过滤数据,过滤掉的行数记录在 `RowsBitmapIndexFiltered` 中。之后,再利用 BloomFilter 索引过滤数据,记录在 `RowsBloomFilterFiltered` 中。`RowsBloomFilterFiltered` 的值是 Segment 的总行数(而不是Bitmap索引过滤后的行数)和经过 BloomFilter 过滤后剩余行数的差值,因此 BloomFilter 过滤掉的数据可能会和 Bitmap 过滤掉的数据有重叠。 - - `RowsStatsFiltered` 中记录的是经过其他谓词条件过滤掉的行数,这里包括下推到存储引擎的谓词条件,以及存储引擎中的 Delete 条件。 - - `RowsDelFiltered` 中包含了 `RowsBloomFilterFiltered` 和 `RowsStatsFiltered` 记录的过滤行数。 - - `RawRowsRead` 是经过上述过滤后,最终需要读取的行数。而 `RowsRead` 是最终返回给 Scanner 的行数。`RowsRead` 通常小于 `RawRowsRead`,是因为从存储引擎返回到 Scanner,可能会经过一次数据聚合。 - - `RowsReturned` 是 ScanNode 最终返回给上层节点的行数。`RowsReturned` 通常也会小于 -`RowsRead`。因为在 Scanner 上会有一些没有下推给存储引擎的谓词条件,会进行一次过滤。 - - 通过以上指标,可以大致分析出存储引擎处理的行数以及最终过滤后的结果行数大小。通过 `Rows***Filtered` 这组指标,也可以分析查询条件是否下推到了存储引擎,以及不同索引的过滤效果。 - - 如果 `RawRowsRead` 和 `RowsRead` 差距较大,则说明大量的行被聚合,而聚合可能比较耗时。如果 `RowsRead` 和 `RowsReturned` 差距较大,则说明很多行在 Scanner 中进行了过滤。这说明很多选择度高的谓词条件并没有推送给存储引擎。而在 Scanner 中的过滤效率会比在存储引擎中过滤效率差。 - -* Scan Node Profile 的简单分析 + - 当读取一个 V2 格式的 Segment 时,若查询存在 key_ranges(前缀key组成的查询范围),首先通过 SortkeyIndex 索引过滤数据,过滤的行数记录在 `RowsKeyRangeFiltered`。 + - 之后,对查询条件中含有 bitmap 索引的列,使用 Bitmap 索引进行精确过滤,过滤的行数记录在 `RowsBitmapIndexFiltered`。 + - 之后,按查询条件中的等值(eq,in,is)条件,使用BloomFilter索引过滤数据,记录在 `RowsBloomFilterFiltered`。`RowsBloomFilterFiltered` 的值是 Segment 
的总行数(而不是Bitmap索引过滤后的行数)和经过 BloomFilter 过滤后剩余行数的差值,因此 BloomFilter 过滤的数据可能会和 Bitmap 过滤的数据有重叠。 + - 之后,按查询条件和删除条件,使用 ZoneMap 索引过滤数据,记录在 `RowsStatsFiltered`。 + - `RowsConditionsFiltered` 是各种索引过滤的行数,包含了 `RowsBloomFilterFiltered` 和 `RowsStatsFiltered` 的值。 + - 至此 Init 阶段完成,Next 阶段删除条件过滤的行数,记录在 `RowsDelFiltered`。因此删除条件实际过滤的行数,分别记录在 `RowsStatsFiltered` 和 `RowsDelFiltered` 中。 + - `RawRowsRead` 是经过上述过滤后,最终需要读取的行数。 + - `RowsRead` 是最终返回给 Scanner 的行数。`RowsRead` 通常小于 `RawRowsRead`,是因为从存储引擎返回到 Scanner,可能会经过一次数据聚合。如果 `RawRowsRead` 和 `RowsRead` 差距较大,则说明大量的行被聚合,而聚合可能比较耗时。 + - `RowsReturned` 是 ScanNode 最终返回给上层节点的行数。`RowsReturned` 通常也会小于`RowsRead`。因为在 Scanner 上会有一些没有下推给存储引擎的谓词条件,会进行一次过滤。如果 `RowsRead` 和 `RowsReturned` 差距较大,则说明很多行在 Scanner 中进行了过滤。这说明很多选择度高的谓词条件并没有推送给存储引擎。而在 Scanner 中的过滤效率会比在存储引擎中过滤效率差。 - OlapScanNode 的 Profile 通常用于分析数据扫描的效率。除了前面介绍的通过行数相关信息可以推断谓词条件下推和索引使用情况外,还可以通过以下几个方面进行简单的分析。 +通过以上指标,可以大致分析出存储引擎处理的行数以及最终过滤后的结果行数大小。通过 `Rows***Filtered` 这组指标,也可以分析查询条件是否下推到了存储引擎,以及不同索引的过滤效果。此外还可以通过以下几个方面进行简单的分析。 - * 首先,很多指标,如 `IOTimer`,`BlockFetchTime` 等都是所有 Scanner 线程指标的累加,因此数值可能会比较大。并且因为 Scanner 线程是异步读取数据的,所以这些累加指标只能反映 Scanner 累加的工作时间,并不直接代表 ScanNode 的耗时。ScanNode 在整个查询计划中的耗时占比为 `Active` 字段记录的值。有时会出现比如 `IOTimer` 有几十秒,而 `Active` 实际只有几秒钟。这种情况通常因为:1. `IOTimer` 为多个 Scanner 的累加时间,而 Scanner 数量较多。2. 
上层节点比较耗时。比如上层节点耗时 100秒,而底层 ScanNode 只需 10秒。则反映在 `Active` 的字段可能只有几毫秒。因为在上层处理数据的同时,ScanNode 已经异步的进行了数据扫描并准备好了数据。当上层节点从 ScanNode 获取数据时,可以获取到已经准备好的数据,因此 Active 时间很短。 - * IOTimer 是 IO 时间,能够直接反映 IO 操作耗时。这里是所有 Scanner 线程累加的 IO 时间。 - * NumScanners 表示 Scanner 线程数。线程数过多或过少都会影响查询效率。同时可以用一些汇总指标除以线程数来大致的估算每个线程的耗时。 - * TabletCount 表示需要扫描的 tablet 数量。数量过多可能意味着需要大量的随机读取和数据合并操作。 - * UncompressedBytesRead 间接反映了读取的数据量。如果该数值较大,说明可能有大量的 IO 操作。 - * CachedPagesNum 和 TotalPagesNum。对于 V2 格式,可以查看命中 PageCache 的情况。命中率越高,说明 IO 和解压操作耗时越少。 + - `OlapScanner` 下的很多指标,如 `IOTimer`,`BlockFetchTime` 等都是所有 Scanner 线程指标的累加,因此数值可能会比较大。并且因为 Scanner 线程是异步读取数据的,所以这些累加指标只能反映 Scanner 累加的工作时间,并不直接代表 ScanNode 的耗时。ScanNode 在整个查询计划中的耗时占比为 `Active` 字段记录的值。有时会出现比如 `IOTimer` 有几十秒,而 `Active` 实际只有几秒钟。这种情况通常因为: + - `IOTimer` 为多个 Scanner 的累加时间,而 Scanner 数量较多。 + - 上层节点比较耗时。比如上层节点耗时 100秒,而底层 ScanNode 只需 10秒。则反映在 `Active` 的字段可能只有几毫秒。因为在上层处理数据的同时,ScanNode 已经异步的进行了数据扫描并准备好了数据。当上层节点从 ScanNode 获取数据时,可以获取到已经准备好的数据,因此 Active 时间很短。 + - `NumScanners` 表示 Scanner 线程数。线程数过多或过少都会影响查询效率。同时可以用一些汇总指标除以线程数来大致的估算每个线程的耗时。 + - `TabletCount` 表示需要扫描的 tablet 数量。数量过多可能意味着需要大量的随机读取和数据合并操作。 + - `UncompressedBytesRead` 间接反映了读取的数据量。如果该数值较大,说明可能有大量的 IO 操作。 + - `CachedPagesNum` 和 `TotalPagesNum` 可以查看命中 PageCache 的情况。命中率越高,说明 IO 和解压操作耗时越少。 #### `Buffer pool` - AllocTime: 内存分配耗时 From c123704be9cdf9e41cac64e109010155aebc85ab Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Mon, 2 Nov 2020 18:33:52 +0800 Subject: [PATCH 2/4] _segment_profile and scanner_profile move to scan_node --- be/src/exec/olap_scan_node.cpp | 10 ++++++++++ be/src/exec/olap_scan_node.h | 6 ++++++ be/src/exec/scan_node.cpp | 13 ++----------- be/src/exec/scan_node.h | 5 ----- docs/en/administrator-guide/running-profile.md | 4 ++-- docs/zh-CN/administrator-guide/running-profile.md | 4 ++-- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index f36f22537f960a..4b953b5c5b8050 100644 --- 
a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -87,6 +87,15 @@ Status OlapScanNode::init(const TPlanNode& tnode, RuntimeState* state) { return Status::OK(); } + +void OlapScanNode::init_scan_profile() { + _scanner_profile.reset(new RuntimeProfile("OlapScanner")); + runtime_profile()->add_child(_scanner_profile.get(), true, NULL); + + _segment_profile.reset(new RuntimeProfile("SegmentIterator")); + _scanner_profile->add_child(_segment_profile.get(), true, NULL); +} + void OlapScanNode::_init_counter(RuntimeState* state) { ADD_TIMER(_scanner_profile, "ShowHintsTime_V1"); @@ -129,6 +138,7 @@ void OlapScanNode::_init_counter(RuntimeState* state) { } Status OlapScanNode::prepare(RuntimeState* state) { + init_scan_profile(); RETURN_IF_ERROR(ScanNode::prepare(state)); // create scanner profile // create timer diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h index df4d529ad94cf4..9a10a85cb0e157 100644 --- a/be/src/exec/olap_scan_node.h +++ b/be/src/exec/olap_scan_node.h @@ -158,6 +158,9 @@ class OlapScanNode : public ScanNode { private: void _init_counter(RuntimeState* state); + // OLAP_SCAN_NODE profile layering: OLAP_SCAN_NODE, OlapScanner, and SegmentIterator + // according to the calling relationship + void init_scan_profile(); void construct_is_null_pred_in_where_pred(Expr* expr, SlotDescriptor* slot, std::string is_null_str); @@ -254,6 +257,9 @@ class OlapScanNode : public ScanNode { // or be overwritten by value in TQueryOptions int32_t _max_pushdown_conditions_per_column = 1024; + boost::scoped_ptr _scanner_profile; + boost::scoped_ptr _segment_profile; + // Counters RuntimeProfile::Counter* _io_timer = nullptr; RuntimeProfile::Counter* _read_compressed_counter = nullptr; diff --git a/be/src/exec/scan_node.cpp b/be/src/exec/scan_node.cpp index d2d90fe47b6761..7ab4809af8a03c 100644 --- a/be/src/exec/scan_node.cpp +++ b/be/src/exec/scan_node.cpp @@ -30,14 +30,13 @@ const string 
ScanNode::_s_scanner_thread_total_wallclock_time = "ScannerThreadsTotalWallClockTime"; Status ScanNode::prepare(RuntimeState* state) { - init_scan_profile(); RETURN_IF_ERROR(ExecNode::prepare(state)); _bytes_read_counter = - ADD_COUNTER(_segment_profile, _s_bytes_read_counter, TUnit::BYTES); + ADD_COUNTER(runtime_profile(), _s_bytes_read_counter, TUnit::BYTES); //TODO: The _rows_read_counter == RowsReturned counter in exec node, there is no need to keep both of them _rows_read_counter = - ADD_COUNTER(_scanner_profile, _s_rows_read_counter, TUnit::UNIT); + ADD_COUNTER(runtime_profile(), _s_rows_read_counter, TUnit::UNIT); #ifndef BE_TEST #endif _materialize_tuple_timer = ADD_CHILD_TIMER(runtime_profile(), _s_materialize_tuple_timer, @@ -48,12 +47,4 @@ Status ScanNode::prepare(RuntimeState* state) { return Status::OK(); } -void ScanNode::init_scan_profile() { - _scanner_profile.reset(new RuntimeProfile("OlapScanner")); - runtime_profile()->add_child(_scanner_profile.get(), true, NULL); - - _segment_profile.reset(new RuntimeProfile("SegmentIterator")); - _scanner_profile->add_child(_segment_profile.get(), true, NULL); -} - } diff --git a/be/src/exec/scan_node.h b/be/src/exec/scan_node.h index 98a754b3b382e8..67326e533cf4ba 100644 --- a/be/src/exec/scan_node.h +++ b/be/src/exec/scan_node.h @@ -99,8 +99,6 @@ class ScanNode : public ExecNode { RuntimeProfile::Counter* materialize_tuple_timer() const { return _materialize_tuple_timer; } - // OLAP_SCAN_NODE profile layering: OLAP_SCAN_NODE, OlapScanner, and SegmentIterator according to the calling relationship - void init_scan_profile(); // names of ScanNode common counters static const std::string _s_bytes_read_counter; @@ -116,9 +114,6 @@ class ScanNode : public ExecNode { RuntimeProfile::Counter* _rows_read_counter; RuntimeProfile::Counter* _num_disks_accessed_counter; RuntimeProfile::Counter* _materialize_tuple_timer; // time writing tuple slots - - boost::scoped_ptr _scanner_profile; - boost::scoped_ptr 
_segment_profile; }; } diff --git a/docs/en/administrator-guide/running-profile.md b/docs/en/administrator-guide/running-profile.md index 7022117567cf1d..1ed7689e688379 100644 --- a/docs/en/administrator-guide/running-profile.md +++ b/docs/en/administrator-guide/running-profile.md @@ -154,9 +154,11 @@ The profile of a typical `OLAP_SCAN_NODE` is as follows. Some indicators will ha ``` OLAP_SCAN_NODE (id=0):(Active: 1.2ms,% non-child: 0.00%) + - BytesRead: 265.00 B # The amount of data read from the data file. Assuming that 10 32-bit integers are read, the amount of data is 10 * 4B = 40 Bytes. This data only represents the fully expanded size of the data in memory, and does not represent the actual IO size. - NumDiskAccess: 1 # The number of disks involved in this ScanNode node. - NumScanners: 20 # The number of Scanners generated by this ScanNode. - PeakMemoryUsage: 0.00 # Peak memory usage during query, not used yet + - RowsRead: 7 # The number of rows returned from the storage engine to the Scanner, excluding the number of rows filtered by the Scanner. - RowsReturned: 7 # The number of rows returned from ScanNode to the upper node. - RowsReturnedRate: 6.979K /sec # RowsReturned/ActiveTime - TabletCount: 20 # The number of Tablets involved in this ScanNode. @@ -166,7 +168,6 @@ OLAP_SCAN_NODE (id=0):(Active: 1.2ms,% non-child: 0.00%) - ReaderInitTime: 5.475ms # The time when OlapScanner initializes Reader. V1 includes the time to form MergeHeap. V2 includes the time to generate various Iterators and read the first group of blocks. - RowsDelFiltered: 0 # Including the number of rows filtered out according to the Delete information in the Tablet, and the number of rows filtered for marked deleted rows under the unique key model. - RowsPushedCondFiltered: 0 # Filter conditions based on the predicates passed down, such as the conditions passed from BuildTable to ProbeTable in Join calculation. 
This value is not accurate, because if the filtering effect is poor, it will no longer be filtered. - - RowsRead: 7 # The number of rows returned from the storage engine to the Scanner, excluding the number of rows filtered by the Scanner. - ScanTime: 39.24us # The number of rows returned from ScanNode to the upper node. - ShowHintsTime_V1: 0ns # V2 has no meaning. Read part of the data in V1 to perform ScanRange segmentation. SegmentIterator: @@ -175,7 +176,6 @@ OLAP_SCAN_NODE (id=0):(Active: 1.2ms,% non-child: 0.00%) - BlockSeekCount: 12 # The number of block seeks when reading Segment. - BlockSeekTime: 222.556us # It takes time to block seek when reading Segment. - BlocksLoad: 6 # read the number of blocks - - BytesRead: 265.00 B # The amount of data read from the data file. Assuming that 10 32-bit integers are read, the amount of data is 10 * 4B = 40 Bytes. This data only represents the fully expanded size of the data in memory, and does not represent the actual IO size. - CachedPagesNum: 30 # In V2 only, when PageCache is enabled, the number of Pages that hit the Cache. - CompressedBytesRead: 0.00 # In V1, the size of the data read from the file before decompression. In V2, the pre-compressed size of the read page that did not hit the PageCache. - DecompressorTimer: 0ns # Data decompression takes time. 
diff --git a/docs/zh-CN/administrator-guide/running-profile.md b/docs/zh-CN/administrator-guide/running-profile.md index ca2702f3f461ab..047de8f02c2eb5 100644 --- a/docs/zh-CN/administrator-guide/running-profile.md +++ b/docs/zh-CN/administrator-guide/running-profile.md @@ -153,9 +153,11 @@ BE端收集的统计信息较多,下面列出了各个参数的对应含义: ``` OLAP_SCAN_NODE (id=0):(Active: 1.2ms, % non-child: 0.00%) + - BytesRead: 265.00 B # 从数据文件中读取到的数据量。假设读取到了是10个32位整型,则数据量为 10 * 4B = 40 Bytes。这个数据仅表示数据在内存中全展开的大小,并不代表实际的 IO 大小。 - NumDiskAccess: 1 # 该 ScanNode 节点涉及到的磁盘数量。 - NumScanners: 20 # 该 ScanNode 生成的 Scanner 数量。 - PeakMemoryUsage: 0.00 # 查询时内存使用的峰值,暂未使用 + - RowsRead: 7 # 从存储引擎返回到 Scanner 的行数,不包括经 Scanner 过滤的行数。 - RowsReturned: 7 # 从 ScanNode 返回给上层节点的行数。 - RowsReturnedRate: 6.979K /sec # RowsReturned/ActiveTime - TabletCount : 20 # 该 ScanNode 涉及的 Tablet 数量。 @@ -165,7 +167,6 @@ OLAP_SCAN_NODE (id=0):(Active: 1.2ms, % non-child: 0.00%) - ReaderInitTime: 5.475ms # OlapScanner 初始化 Reader 的时间。V1 中包括组建 MergeHeap 的时间。V2 中包括生成各级 Iterator 并读取第一组Block的时间。 - RowsDelFiltered: 0 # 包括根据 Tablet 中存在的 Delete 信息过滤掉的行数,以及 unique key 模型下对被标记的删除行过滤的行数。 - RowsPushedCondFiltered: 0 # 根据传递下推的谓词过滤掉的条件,比如 Join 计算中从 BuildTable 传递给 ProbeTable 的条件。该数值不准确,因为如果过滤效果差,就不再过滤了。 - - RowsRead: 7 # 从存储引擎返回到 Scanner 的行数,不包括经 Scanner 过滤的行数。 - ScanTime: 39.24us # Scanner 调用 get_next() 方法的耗时统计。 - ShowHintsTime_V1: 0ns # V2 中无意义。V1 中读取部分数据来进行 ScanRange 的切分。 SegmentIterator: @@ -174,7 +175,6 @@ OLAP_SCAN_NODE (id=0):(Active: 1.2ms, % non-child: 0.00%) - BlockSeekCount: 12 # 读取 Segment 时进行 block seek 的次数。 - BlockSeekTime: 222.556us # 读取 Segment 时进行 block seek 的耗时。 - BlocksLoad: 6 # 读取 Block 的数量 - - BytesRead: 265.00 B # 从数据文件中读取到的数据量。假设读取到了是10个32位整型,则数据量为 10 * 4B = 40 Bytes。这个数据仅表示数据在内存中全展开的大小,并不代表实际的 IO 大小。 - CachedPagesNum: 30 # 仅 V2 中,当开启 PageCache 后,命中 Cache 的 Page 数量。 - CompressedBytesRead: 0.00 # V1 中,从文件中读取的解压前的数据大小。V2 中,读取到的没有命中 PageCache 的 Page 的压缩前的大小。 - DecompressorTimer: 0ns # 数据解压耗时。 From dccf155843ab45b0510e83e99265bff8094eff0f Mon
Sep 17 00:00:00 2001 From: Xinyi Zou Date: Mon, 2 Nov 2020 19:01:16 +0800 Subject: [PATCH 3/4] std::unique_ptr replace boost::scoped_ptr --- be/src/exec/olap_scan_node.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h index 9a10a85cb0e157..9e6b371dc21cf3 100644 --- a/be/src/exec/olap_scan_node.h +++ b/be/src/exec/olap_scan_node.h @@ -257,8 +257,8 @@ class OlapScanNode : public ScanNode { // or be overwritten by value in TQueryOptions int32_t _max_pushdown_conditions_per_column = 1024; - boost::scoped_ptr _scanner_profile; - boost::scoped_ptr _segment_profile; + std::unique_ptr _scanner_profile; + std::unique_ptr _segment_profile; // Counters RuntimeProfile::Counter* _io_timer = nullptr; From fdc72d4c5c9d9b95f2e1cad49d79e36015479581 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Mon, 9 Nov 2020 17:11:19 +0800 Subject: [PATCH 4/4] recovery TotalReadThroughput counter --- be/src/exec/scan_node.cpp | 3 +++ be/src/exec/scan_node.h | 6 ++++++ docs/en/administrator-guide/running-profile.md | 5 +++-- docs/zh-CN/administrator-guide/running-profile.md | 5 +++-- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/be/src/exec/scan_node.cpp b/be/src/exec/scan_node.cpp index 7ab4809af8a03c..3891b7edc4371e 100644 --- a/be/src/exec/scan_node.cpp +++ b/be/src/exec/scan_node.cpp @@ -23,6 +23,7 @@ namespace doris { const string ScanNode::_s_bytes_read_counter = "BytesRead"; const string ScanNode::_s_rows_read_counter = "RowsRead"; +const string ScanNode::_s_total_throughput_counter = "TotalReadThroughput"; const string ScanNode::_s_materialize_tuple_timer = "MaterializeTupleTime(*)"; const string ScanNode::_s_num_disks_accessed_counter = "NumDiskAccess"; const string ScanNode::_s_scanner_thread_counters_prefix = "ScannerThreads"; @@ -38,6 +39,8 @@ Status ScanNode::prepare(RuntimeState* state) { _rows_read_counter = ADD_COUNTER(runtime_profile(), _s_rows_read_counter, TUnit::UNIT); #ifndef 
BE_TEST + _total_throughput_counter = runtime_profile()->add_rate_counter( + _s_total_throughput_counter, _bytes_read_counter); #endif _materialize_tuple_timer = ADD_CHILD_TIMER(runtime_profile(), _s_materialize_tuple_timer, _s_scanner_thread_total_wallclock_time); diff --git a/be/src/exec/scan_node.h b/be/src/exec/scan_node.h index 67326e533cf4ba..eacb75011bcbfe 100644 --- a/be/src/exec/scan_node.h +++ b/be/src/exec/scan_node.h @@ -96,6 +96,9 @@ class ScanNode : public ExecNode { RuntimeProfile::Counter* rows_read_counter() const { return _rows_read_counter; } + RuntimeProfile::Counter* total_throughput_counter() const { + return _total_throughput_counter; + } RuntimeProfile::Counter* materialize_tuple_timer() const { return _materialize_tuple_timer; } @@ -103,6 +106,7 @@ class ScanNode : public ExecNode { // names of ScanNode common counters static const std::string _s_bytes_read_counter; static const std::string _s_rows_read_counter; + static const std::string _s_total_throughput_counter; static const std::string _s_num_disks_accessed_counter; static const std::string _s_materialize_tuple_timer; static const std::string _s_scanner_thread_counters_prefix; @@ -112,6 +116,8 @@ class ScanNode : public ExecNode { RuntimeProfile::Counter* _bytes_read_counter; // # bytes read from the scanner // # rows/tuples read from the scanner (including those discarded by eval_conjucts()) RuntimeProfile::Counter* _rows_read_counter; + // Wall based aggregate read throughput [bytes/sec] + RuntimeProfile::Counter* _total_throughput_counter; RuntimeProfile::Counter* _num_disks_accessed_counter; RuntimeProfile::Counter* _materialize_tuple_timer; // time writing tuple slots }; diff --git a/docs/en/administrator-guide/running-profile.md b/docs/en/administrator-guide/running-profile.md index 1ed7689e688379..de558b44337b9a 100644 --- a/docs/en/administrator-guide/running-profile.md +++ b/docs/en/administrator-guide/running-profile.md @@ -144,7 +144,7 @@ There are many statistical 
information collected at BE. so we list the correspo #### `OLAP_SCAN_NODE` -The `OLAP_SCAN_NODE` is responsible for specific data scanning tasks. One `OLAP_SCAN_NODE` will generate one or more `OlapScanner` threads. Each Scanner thread is responsible for scanning part of the data. +The `OLAP_SCAN_NODE` is responsible for specific data scanning tasks. One `OLAP_SCAN_NODE` will generate one or more `OlapScanner`. Each Scanner thread is responsible for scanning part of the data. Some or all of the predicate conditions in the query will be pushed to `OLAP_SCAN_NODE`. Some of these predicate conditions will continue to be pushed down to the storage engine in order to use the storage engine's index for data filtering. The other part will be kept in `OLAP_SCAN_NODE` to filter the data returned from the storage engine. @@ -162,6 +162,7 @@ OLAP_SCAN_NODE (id=0):(Active: 1.2ms,% non-child: 0.00%) - RowsReturned: 7 # The number of rows returned from ScanNode to the upper node. - RowsReturnedRate: 6.979K /sec # RowsReturned/ActiveTime - TabletCount: 20 # The number of Tablets involved in this ScanNode. + - TotalReadThroughput: 74.70 KB/sec # BytesRead divided by the total time spent in this node (from Open to Close). For IO bounded queries, this should be very close to the total throughput of all the disks OlapScanner: - BlockConvertTime: 8.941us # The time it takes to convert a vectorized Block into a RowBlock with a row structure. The vectorized Block is VectorizedRowBatch in V1 and RowBlockV2 in V2. - BlockFetchTime: 468.974us # Rowset Reader gets the time of the Block. @@ -212,7 +213,7 @@ Through the above indicators, you can roughly analyze the number of rows process - Many indicators under `OlapScanner`, such as `IOTimer`, `BlockFetchTime`, etc., are the accumulation of all Scanner thread indicators, so the value may be relatively large. 
And because the Scanner thread reads data asynchronously, these cumulative indicators can only reflect the cumulative working time of the Scanner, and do not directly represent the time consumption of the ScanNode. The time-consuming ratio of ScanNode in the entire query plan is the value recorded in the `Active` field. Sometimes it appears that `IOTimer` has tens of seconds, but `Active` is actually only a few seconds. This situation is usually due to: - `IOTimer` is the accumulated time of multiple Scanners, and there are more Scanners. - The upper node is time-consuming. For example, the upper node takes 100 seconds, while the lower ScanNode only takes 10 seconds. The field reflected in `Active` may be only a few milliseconds. Because while the upper layer is processing data, ScanNode has performed data scanning asynchronously and prepared the data. When the upper node obtains data from ScanNode, it can obtain the prepared data, so the Active time is very short. - - `NumScanners` indicates the number of Scanner threads. Too many or too few threads will affect query efficiency. At the same time, some summary indicators can be divided by the number of threads to roughly estimate the time consumption of each thread. + - `NumScanners` represents the number of Tasks submitted by the Scanner to the thread pool. It is scheduled by the thread pool in `RuntimeState`. The two parameters `doris_scanner_thread_pool_thread_num` and `doris_scanner_thread_pool_queue_size` control the size of the thread pool and the queue length respectively. Too many or too few threads will affect query efficiency. At the same time, some summary indicators can be divided by the number of threads to roughly estimate the time consumption of each thread. - `TabletCount` indicates the number of tablets to be scanned. Too many may mean a lot of random read and data merge operations. - `UncompressedBytesRead` indirectly reflects the amount of data read. 
If the value is large, it means that there may be a lot of IO operations. - `CachedPagesNum` and `TotalPagesNum` can check the hitting status of PageCache. The higher the hit rate, the less time-consuming IO and decompression operations. diff --git a/docs/zh-CN/administrator-guide/running-profile.md b/docs/zh-CN/administrator-guide/running-profile.md index 047de8f02c2eb5..e73b2f31004503 100644 --- a/docs/zh-CN/administrator-guide/running-profile.md +++ b/docs/zh-CN/administrator-guide/running-profile.md @@ -143,7 +143,7 @@ BE端收集的统计信息较多,下面列出了各个参数的对应含义: #### `OLAP_SCAN_NODE` -`OLAP_SCAN_NODE` 节点负责具体的数据扫描任务。一个 `OLAP_SCAN_NODE` 会生成一个或多个 `OlapScanner` 线程。每个 Scanner 线程负责扫描部分数据。 +`OLAP_SCAN_NODE` 节点负责具体的数据扫描任务。一个 `OLAP_SCAN_NODE` 会生成一个或多个 `OlapScanner` 。每个 Scanner 线程负责扫描部分数据。 查询中的部分或全部谓词条件会推送给 `OLAP_SCAN_NODE`。这些谓词条件中一部分会继续下推给存储引擎,以便利用存储引擎的索引进行数据过滤。另一部分会保留在 `OLAP_SCAN_NODE` 中,用于过滤从存储引擎中返回的数据。 @@ -161,6 +161,7 @@ OLAP_SCAN_NODE (id=0):(Active: 1.2ms, % non-child: 0.00%) - RowsReturned: 7 # 从 ScanNode 返回给上层节点的行数。 - RowsReturnedRate: 6.979K /sec # RowsReturned/ActiveTime - TabletCount : 20 # 该 ScanNode 涉及的 Tablet 数量。 + - TotalReadThroughput: 74.70 KB/sec # BytesRead除以该节点运行的总时间(从Open到Close),对于IO受限的查询,接近磁盘的总吞吐量。 OlapScanner: - BlockConvertTime: 8.941us # 将向量化Block转换为行结构的 RowBlock 的耗时。向量化 Block 在 V1 中为 VectorizedRowBatch,V2中为 RowBlockV2。 - BlockFetchTime: 468.974us # Rowset Reader 获取 Block 的时间。 @@ -211,7 +212,7 @@ OLAP_SCAN_NODE (id=0):(Active: 1.2ms, % non-child: 0.00%) - `OlapScanner` 下的很多指标,如 `IOTimer`,`BlockFetchTime` 等都是所有 Scanner 线程指标的累加,因此数值可能会比较大。并且因为 Scanner 线程是异步读取数据的,所以这些累加指标只能反映 Scanner 累加的工作时间,并不直接代表 ScanNode 的耗时。ScanNode 在整个查询计划中的耗时占比为 `Active` 字段记录的值。有时会出现比如 `IOTimer` 有几十秒,而 `Active` 实际只有几秒钟。这种情况通常因为: - `IOTimer` 为多个 Scanner 的累加时间,而 Scanner 数量较多。 - 上层节点比较耗时。比如上层节点耗时 100秒,而底层 ScanNode 只需 10秒。则反映在 `Active` 的字段可能只有几毫秒。因为在上层处理数据的同时,ScanNode 已经异步的进行了数据扫描并准备好了数据。当上层节点从 ScanNode 获取数据时,可以获取到已经准备好的数据,因此 Active 时间很短。 - - `NumScanners` 表示 Scanner 
线程数。线程数过多或过少都会影响查询效率。同时可以用一些汇总指标除以线程数来大致的估算每个线程的耗时。 + - `NumScanners` 表示 Scanner 提交到线程池的Task个数,由 `RuntimeState` 中的线程池调度,`doris_scanner_thread_pool_thread_num` 和 `doris_scanner_thread_pool_queue_size` 两个参数分别控制线程池的大小和队列长度。线程数过多或过少都会影响查询效率。同时可以用一些汇总指标除以线程数来大致的估算每个线程的耗时。 - `TabletCount` 表示需要扫描的 tablet 数量。数量过多可能意味着需要大量的随机读取和数据合并操作。 - `UncompressedBytesRead` 间接反映了读取的数据量。如果该数值较大,说明可能有大量的 IO 操作。 - `CachedPagesNum` 和 `TotalPagesNum` 可以查看命中 PageCache 的情况。命中率越高,说明 IO 和解压操作耗时越少。