From c6d083a2e6beba9319e07b983b597aef3889ac2d Mon Sep 17 00:00:00 2001 From: yiguolei Date: Tue, 11 Oct 2022 15:08:43 +0800 Subject: [PATCH 1/6] [improvement](memory) disable page cache and chunk allocator, optimize memory allocate size --- be/src/common/config.h | 14 +++-- be/src/runtime/exec_env_init.cpp | 14 ++--- be/src/runtime/mem_pool.cpp | 5 +- be/src/runtime/mem_pool.h | 6 +-- be/src/runtime/memory/chunk_allocator.cpp | 62 ++++++++++++----------- be/src/util/bit_util.h | 10 +++- be/src/vec/common/arena.h | 11 ++-- be/src/vec/common/pod_array.h | 31 ++++++++---- 8 files changed, 89 insertions(+), 64 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index 021f5cd680107a..0b8ac4af9f0048 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -239,7 +239,7 @@ CONF_Int32(storage_page_cache_shard_size, "16"); // all storage page cache will be divided into data_page_cache and index_page_cache CONF_Int32(index_page_cache_percentage, "10"); // whether to disable page cache feature in storage -CONF_Bool(disable_storage_page_cache, "false"); +CONF_Bool(disable_storage_page_cache, "true"); CONF_Bool(enable_storage_vectorization, "true"); @@ -439,14 +439,20 @@ CONF_Bool(disable_mem_pools, "false"); // increase this variable can improve performance, // but will acquire more free memory which can not be used by other modules. CONF_mString(chunk_reserved_bytes_limit, "10%"); -// 1024, The minimum chunk allocator size (in bytes) -CONF_Int32(min_chunk_reserved_bytes, "1024"); + +// Whether using chunk allocator to cache memory chunk +CONF_Bool(disable_chunk_allocator, "true"); // Disable Chunk Allocator in Vectorized Allocator, this will reduce memory cache. // For high concurrent queries, using Chunk Allocator with vectorized Allocator can reduce the impact // of gperftools tcmalloc central lock. // Jemalloc or google tcmalloc have core cache, Chunk Allocator may no longer be needed after replacing // gperftools tcmalloc. 
-CONF_mBool(disable_chunk_allocator_in_vec, "false"); +CONF_mBool(disable_chunk_allocator_in_vec, "true"); + +// Both MemPool and vectorized engine's podarray allocator, vectorized engine's arena will try to allocate memory as power of two. +// But if the memory is very large then power of two is also very large. This config means if the allocated memory's size is larger +// than this limit then all allocators will not use RoundUpToPowerOfTwo to allocate memory. +CONF_mInt64(memory_linear_growth_threshold, "134217728"); // 128Mb // The probing algorithm of partitioned hash table. // Enable quadratic probing hash table diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index a55f5477782980..ca342f1f5e9911 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -185,6 +185,8 @@ Status ExecEnv::_init(const std::vector& store_paths) { } Status ExecEnv::_init_mem_tracker() { + LOG(INFO) << "Physical memory is: " + << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); // 1. init global memory limit. int64_t global_memory_limit_bytes = 0; bool is_percent = false; @@ -199,9 +201,7 @@ Status ExecEnv::_init_mem_tracker() { if (global_memory_limit_bytes > MemInfo::physical_mem()) { LOG(WARNING) << "Memory limit " << PrettyPrinter::print(global_memory_limit_bytes, TUnit::BYTES) - << " exceeds physical memory of " - << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES) - << ". Using physical memory instead"; + << " exceeds physical memory, using physical memory instead"; global_memory_limit_bytes = MemInfo::physical_mem(); } _process_mem_tracker = @@ -308,12 +308,6 @@ Status ExecEnv::_init_mem_tracker() { RETURN_IF_ERROR(_tmp_file_mgr->init()); // 5. 
init chunk allocator - if (!BitUtil::IsPowerOf2(config::min_chunk_reserved_bytes)) { - ss << "Config min_chunk_reserved_bytes must be a power-of-two: " - << config::min_chunk_reserved_bytes; - return Status::InternalError(ss.str()); - } - int64_t chunk_reserved_bytes_limit = ParseUtil::parse_mem_spec(config::chunk_reserved_bytes_limit, global_memory_limit_bytes, MemInfo::physical_mem(), &is_percent); @@ -323,8 +317,6 @@ Status ExecEnv::_init_mem_tracker() { << config::chunk_reserved_bytes_limit; return Status::InternalError(ss.str()); } - chunk_reserved_bytes_limit = - BitUtil::RoundDown(chunk_reserved_bytes_limit, config::min_chunk_reserved_bytes); ChunkAllocator::init_instance(chunk_reserved_bytes_limit); LOG(INFO) << "Chunk allocator memory limit: " << PrettyPrinter::print(chunk_reserved_bytes_limit, TUnit::BYTES) diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp index c2b709162c2929..f53dd4746c2a32 100644 --- a/be/src/runtime/mem_pool.cpp +++ b/be/src/runtime/mem_pool.cpp @@ -131,8 +131,9 @@ Status MemPool::find_chunk(size_t min_size, bool check_limits) { DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE); chunk_size = std::max(min_size, next_chunk_size_); } - - chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size); + if (chunk_size < config::memory_linear_growth_threshold) { + chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size); + } if (check_limits && !thread_context()->_thread_mem_tracker_mgr->limiter_mem_tracker_raw()->check_limit( chunk_size)) { diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h index 41240ab375834d..a1db3e6c5dbcdb 100644 --- a/be/src/runtime/mem_pool.h +++ b/be/src/runtime/mem_pool.h @@ -231,9 +231,9 @@ class MemPool { // I refers to https://github.com/mcgov/asan_alignment_example. 
ChunkInfo& info = chunks_[current_chunk_idx_]; - int64_t aligned_allocated_bytes = - BitUtil::RoundUpToPowerOf2(info.allocated_bytes + DEFAULT_PADDING_SIZE, alignment); - if (aligned_allocated_bytes + size <= info.chunk.size) { + int64_t aligned_allocated_bytes = BitUtil::RoundUpToMultiplyOfFactor( + info.allocated_bytes + DEFAULT_PADDING_SIZE, alignment); + if (aligned_allocated_bytes + size + DEFAULT_PADDING_SIZE <= info.chunk.size) { // Ensure the requested alignment is respected. int64_t padding = aligned_allocated_bytes - info.allocated_bytes; uint8_t* result = info.chunk.data + aligned_allocated_bytes; diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp index 43acc79538182f..469e5ef7ab198e 100644 --- a/be/src/runtime/memory/chunk_allocator.cpp +++ b/be/src/runtime/memory/chunk_allocator.cpp @@ -154,35 +154,37 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) Status ChunkAllocator::allocate(size_t size, Chunk* chunk) { CHECK((size > 0 && (size & (size - 1)) == 0)); - // fast path: allocate from current core arena - int core_id = CpuInfo::get_current_core(); - chunk->size = size; - chunk->core_id = core_id; - - if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) { - DCHECK_GE(_reserved_bytes, 0); - _reserved_bytes.fetch_sub(size); - chunk_pool_local_core_alloc_count->increment(1); - // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker. - THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get()); - return Status::OK(); - } - // Second path: try to allocate from other core's arena - // When the reserved bytes is greater than the limit, the chunk is stolen from other arena. - // Otherwise, it is allocated from the system first, which can reserve enough memory as soon as possible. - // After that, allocate from current core arena as much as possible. 
- if (_reserved_bytes > _steal_arena_limit) { - ++core_id; - for (int i = 1; i < _arenas.size(); ++i, ++core_id) { - if (_arenas[core_id % _arenas.size()]->pop_free_chunk(size, &chunk->data)) { - DCHECK_GE(_reserved_bytes, 0); - _reserved_bytes.fetch_sub(size); - chunk_pool_other_core_alloc_count->increment(1); - // reset chunk's core_id to other - chunk->core_id = core_id % _arenas.size(); - // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker. - THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get()); - return Status::OK(); + if (!config::disable_chunk_allocator) { + // fast path: allocate from current core arena + int core_id = CpuInfo::get_current_core(); + chunk->size = size; + chunk->core_id = core_id; + + if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) { + DCHECK_GE(_reserved_bytes, 0); + _reserved_bytes.fetch_sub(size); + chunk_pool_local_core_alloc_count->increment(1); + // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker. + THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get()); + return Status::OK(); + } + // Second path: try to allocate from other core's arena + // When the reserved bytes is greater than the limit, the chunk is stolen from other arena. + // Otherwise, it is allocated from the system first, which can reserve enough memory as soon as possible. + // After that, allocate from current core arena as much as possible. + if (_reserved_bytes > _steal_arena_limit) { + ++core_id; + for (int i = 1; i < _arenas.size(); ++i, ++core_id) { + if (_arenas[core_id % _arenas.size()]->pop_free_chunk(size, &chunk->data)) { + DCHECK_GE(_reserved_bytes, 0); + _reserved_bytes.fetch_sub(size); + chunk_pool_other_core_alloc_count->increment(1); + // reset chunk's core_id to other + chunk->core_id = core_id % _arenas.size(); + // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker. 
+ THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get()); + return Status::OK(); + } } } } @@ -204,7 +206,7 @@ Status ChunkAllocator::allocate(size_t size, Chunk* chunk) { void ChunkAllocator::free(const Chunk& chunk) { DCHECK(chunk.core_id != -1); CHECK((chunk.size & (chunk.size - 1)) == 0); - if (config::disable_mem_pools) { + if (config::disable_chunk_allocator) { SystemAllocator::free(chunk.data, chunk.size); return; } diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index 28534b139b6d9b..f68586df64754a 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -43,6 +43,8 @@ class BitUtil { return value / divisor + (value % divisor != 0); } + static inline size_t round_up_to_page_size(size_t s) { return (s + 4096 - 1) / 4096 * 4096; } + // Returns 'value' rounded up to the nearest multiple of 'factor' static inline int64_t round_up(int64_t value, int64_t factor) { return (value + (factor - 1)) / factor * factor; @@ -304,8 +306,12 @@ class BitUtil { } /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is - /// a power of two - static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) { + /// a power of two, for example + /// Factor has to be a power of two + /// factor = 16, value = 10 --> result = 16 + /// factor = 16, value = 17 --> result = 32 + /// factor = 16, value = 33 --> result = 48 + static inline int64_t RoundUpToMultiplyOfFactor(int64_t value, int64_t factor) { DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); return (value + (factor - 1)) & ~(factor - 1); } diff --git a/be/src/vec/common/arena.h b/be/src/vec/common/arena.h index e136bae1438fe3..8042d5618dd097 100644 --- a/be/src/vec/common/arena.h +++ b/be/src/vec/common/arena.h @@ -127,11 +127,16 @@ class Arena : private boost::noncopyable { public: Arena(size_t initial_size_ = 4096, size_t growth_factor_ = 2, - size_t linear_growth_threshold_ = 128 * 1024 * 1024) + size_t linear_growth_threshold_ = -1) : 
growth_factor(growth_factor_), - linear_growth_threshold(linear_growth_threshold_), head(new Chunk(initial_size_, nullptr)), - size_in_bytes(head->size()) {} + size_in_bytes(head->size()) { + if (linear_growth_threshold_ < 0) { + linear_growth_threshold = config::memory_linear_growth_threshold; + } else { + linear_growth_threshold = linear_growth_threshold_; + } + } ~Arena() { delete head; } diff --git a/be/src/vec/common/pod_array.h b/be/src/vec/common/pod_array.h index 0b979fd6a8d28f..2dcd0a1689fa40 100644 --- a/be/src/vec/common/pod_array.h +++ b/be/src/vec/common/pod_array.h @@ -30,6 +30,8 @@ #include #include +#include "common/config.h" +#include "util/bit_util.h" #include "vec/common/allocator.h" #include "vec/common/bit_helpers.h" #include "vec/common/memcpy_small.h" @@ -120,8 +122,16 @@ class PODArrayBase : private boost::noncopyable, } } + inline size_t round_up_memory_size(size_t required_capacity) { + if (required_capacity > config::memory_linear_growth_threshold) { + return BitUtil::round_up_to_page_size(required_capacity); + } else { + return round_up_to_power_of_two_or_zero(required_capacity); + } + } + void alloc_for_num_elements(size_t num_elements) { - alloc(round_up_to_power_of_two_or_zero(minimum_memory_for_elements(num_elements))); + alloc(round_up_memory_size(minimum_memory_for_elements(num_elements))); } template @@ -189,8 +199,10 @@ class PODArrayBase : private boost::noncopyable, realloc(std::max(integerRoundUp(initial_bytes, ELEMENT_SIZE), minimum_memory_for_elements(1)), std::forward(allocator_params)...); - } else + } else { + // There is still a power of 2 expansion here, this method is used in push back method realloc(allocated_bytes() * 2, std::forward(allocator_params)...); + } } #ifndef NDEBUG @@ -228,9 +240,10 @@ class PODArrayBase : private boost::noncopyable, template void reserve(size_t n, TAllocatorParams&&... 
allocator_params) { - if (n > capacity()) - realloc(round_up_to_power_of_two_or_zero(minimum_memory_for_elements(n)), + if (n > capacity()) { + realloc(round_up_memory_size(minimum_memory_for_elements(n)), std::forward(allocator_params)...); + } } template @@ -444,9 +457,10 @@ class PODArray : public PODArrayBase void insert_prepare(It1 from_begin, It2 from_end, TAllocatorParams&&... allocator_params) { size_t required_capacity = this->size() + (from_end - from_begin); - if (required_capacity > this->capacity()) - this->reserve(round_up_to_power_of_two_or_zero(required_capacity), - std::forward(allocator_params)...); + if (required_capacity > this->capacity()) { + // Reserve function will try to allocate power of two memory size, so that not need expand it here + this->reserve(required_capacity, std::forward(allocator_params)...); + } } /// Do not insert into the array a piece of itself. Because with the resize, the iterators on themselves can be invalidated. @@ -623,8 +637,7 @@ class PODArray : public PODArrayBase void assign(It1 from_begin, It2 from_end) { size_t required_capacity = from_end - from_begin; - if (required_capacity > this->capacity()) - this->reserve(round_up_to_power_of_two_or_zero(required_capacity)); + if (required_capacity > this->capacity()) this->reserve(required_capacity); size_t bytes_to_copy = this->byte_size(required_capacity); memcpy(this->c_start, reinterpret_cast(&*from_begin), bytes_to_copy); From 2ced248bafa7bb94e3aba1d802472fa6a245a6e0 Mon Sep 17 00:00:00 2001 From: yiguolei Date: Wed, 12 Oct 2022 15:56:14 +0800 Subject: [PATCH 2/6] fix bugs --- be/src/vec/common/pod_array.h | 86 ++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/be/src/vec/common/pod_array.h b/be/src/vec/common/pod_array.h index 2dcd0a1689fa40..79c890a9b9d594 100644 --- a/be/src/vec/common/pod_array.h +++ b/be/src/vec/common/pod_array.h @@ -122,16 +122,9 @@ class PODArrayBase : private boost::noncopyable, } } - inline 
size_t round_up_memory_size(size_t required_capacity) { - if (required_capacity > config::memory_linear_growth_threshold) { - return BitUtil::round_up_to_page_size(required_capacity); - } else { - return round_up_to_power_of_two_or_zero(required_capacity); - } - } - + /// Not round up, keep the size just as the application pass in like std::vector void alloc_for_num_elements(size_t num_elements) { - alloc(round_up_memory_size(minimum_memory_for_elements(num_elements))); + alloc(minimum_memory_for_elements(num_elements)); } template @@ -191,6 +184,7 @@ class PODArrayBase : private boost::noncopyable, return (stack_threshold > 0) && (allocated_bytes() <= stack_threshold); } + /// This method is called by push back or emplace back, this is the same behaviour with std::vector template void reserve_for_next_size(TAllocatorParams&&... allocator_params) { if (size() == 0) { @@ -240,50 +234,54 @@ class PODArrayBase : private boost::noncopyable, template void reserve(size_t n, TAllocatorParams&&... allocator_params) { - if (n > capacity()) { - realloc(round_up_memory_size(minimum_memory_for_elements(n)), + if (n > capacity()) + realloc(minimum_memory_for_elements(n), std::forward(allocator_params)...); - } } +} - template - void resize(size_t n, TAllocatorParams&&... allocator_params) { - reserve(n, std::forward(allocator_params)...); - resize_assume_reserved(n); - } +template +void resize(size_t n, TAllocatorParams&&... allocator_params) { + reserve(n, std::forward(allocator_params)...); + resize_assume_reserved(n); +} - void resize_assume_reserved(const size_t n) { - c_end = c_start + byte_size(n); - reset_peak(); - } +void resize_assume_reserved(const size_t n) { + c_end = c_start + byte_size(n); + reset_peak(); +} - const char* raw_data() const { return c_start; } +const char* raw_data() const { + return c_start; +} - template - void push_back_raw(const char* ptr, TAllocatorParams&&... 
allocator_params) { - if (UNLIKELY(c_end == c_end_of_storage)) - reserve_for_next_size(std::forward(allocator_params)...); +template +void push_back_raw(const char* ptr, TAllocatorParams&&... allocator_params) { + if (UNLIKELY(c_end == c_end_of_storage)) + reserve_for_next_size(std::forward(allocator_params)...); - memcpy(c_end, ptr, ELEMENT_SIZE); - c_end += byte_size(1); - reset_peak(); - } + memcpy(c_end, ptr, ELEMENT_SIZE); + c_end += byte_size(1); + reset_peak(); +} - void protect() { +void protect() { #ifndef NDEBUG - protect_impl(PROT_READ); - mprotected = true; + protect_impl(PROT_READ); + mprotected = true; #endif - } +} - void unprotect() { +void unprotect() { #ifndef NDEBUG - if (mprotected) protect_impl(PROT_WRITE); - mprotected = false; + if (mprotected) protect_impl(PROT_WRITE); + mprotected = false; #endif - } +} - ~PODArrayBase() { dealloc(); } +~PODArrayBase() { + dealloc(); +} }; template size() + (from_end - from_begin); if (required_capacity > this->capacity()) { - // Reserve function will try to allocate power of two memory size, so that not need expand it here - this->reserve(required_capacity, std::forward(allocator_params)...); + // std::vector's insert method will expand if required capacity is larger than current
+ this->reserve(round_up_to_power_of_two_or_zero(required_capacity), + std::forward(allocator_params)...); } } @@ -637,7 +636,10 @@ class PODArray : public PODArrayBase void assign(It1 from_begin, It2 from_end) { size_t required_capacity = from_end - from_begin; - if (required_capacity > this->capacity()) this->reserve(required_capacity); + if (required_capacity > this->capacity()) { + // std::vector assign just expands the capacity to the required capacity
+ this->reserve(required_capacity); + } size_t bytes_to_copy = this->byte_size(required_capacity); memcpy(this->c_start, reinterpret_cast(&*from_begin), bytes_to_copy); From 5e1cc77a18e55070d4bd43a7479e132c4152c9de Mon Sep 17 00:00:00 2001 From: yiguolei Date: Wed, 12 Oct 2022
18:07:49 +0800 Subject: [PATCH 3/6] fix bugs --- be/src/vec/common/pod_array.h | 59 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/be/src/vec/common/pod_array.h b/be/src/vec/common/pod_array.h index 79c890a9b9d594..6d844340efc694 100644 --- a/be/src/vec/common/pod_array.h +++ b/be/src/vec/common/pod_array.h @@ -238,50 +238,45 @@ class PODArrayBase : private boost::noncopyable, realloc(minimum_memory_for_elements(n), std::forward(allocator_params)...); } -} -template -void resize(size_t n, TAllocatorParams&&... allocator_params) { - reserve(n, std::forward(allocator_params)...); - resize_assume_reserved(n); -} + template + void resize(size_t n, TAllocatorParams&&... allocator_params) { + reserve(n, std::forward(allocator_params)...); + resize_assume_reserved(n); + } -void resize_assume_reserved(const size_t n) { - c_end = c_start + byte_size(n); - reset_peak(); -} + void resize_assume_reserved(const size_t n) { + c_end = c_start + byte_size(n); + reset_peak(); + } -const char* raw_data() const { - return c_start; -} + const char* raw_data() const { return c_start; } -template -void push_back_raw(const char* ptr, TAllocatorParams&&... allocator_params) { - if (UNLIKELY(c_end == c_end_of_storage)) - reserve_for_next_size(std::forward(allocator_params)...); + template + void push_back_raw(const char* ptr, TAllocatorParams&&... 
allocator_params) { + if (UNLIKELY(c_end == c_end_of_storage)) + reserve_for_next_size(std::forward(allocator_params)...); - memcpy(c_end, ptr, ELEMENT_SIZE); - c_end += byte_size(1); - reset_peak(); -} + memcpy(c_end, ptr, ELEMENT_SIZE); + c_end += byte_size(1); + reset_peak(); + } -void protect() { + void protect() { #ifndef NDEBUG - protect_impl(PROT_READ); - mprotected = true; + protect_impl(PROT_READ); + mprotected = true; #endif -} + } -void unprotect() { + void unprotect() { #ifndef NDEBUG - if (mprotected) protect_impl(PROT_WRITE); - mprotected = false; + if (mprotected) protect_impl(PROT_WRITE); + mprotected = false; #endif -} + } -~PODArrayBase() { - dealloc(); -} + ~PODArrayBase() { dealloc(); } }; template Date: Thu, 13 Oct 2022 14:58:42 +0800 Subject: [PATCH 4/6] fix bugs --- be/src/runtime/memory/chunk_allocator.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp index 469e5ef7ab198e..09df7e23d4048e 100644 --- a/be/src/runtime/memory/chunk_allocator.cpp +++ b/be/src/runtime/memory/chunk_allocator.cpp @@ -154,12 +154,11 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) Status ChunkAllocator::allocate(size_t size, Chunk* chunk) { CHECK((size > 0 && (size & (size - 1)) == 0)); + int core_id = CpuInfo::get_current_core(); + chunk->core_id = core_id; + chunk->size = size; if (!config::disable_chunk_allocator) { // fast path: allocate from current core arena - int core_id = CpuInfo::get_current_core(); - chunk->size = size; - chunk->core_id = core_id; - if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) { DCHECK_GE(_reserved_bytes, 0); _reserved_bytes.fetch_sub(size); From 30514964a9b39f0edb668909ac53a081af580713 Mon Sep 17 00:00:00 2001 From: yiguolei Date: Fri, 14 Oct 2022 14:10:44 +0800 Subject: [PATCH 5/6] fix bugs --- be/src/common/config.h | 3 +++ be/src/runtime/exec_env_init.cpp | 10 ++++++++-- be/src/runtime/mem_pool.cpp | 16 
++-------------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index 0b8ac4af9f0048..4670e8a4b942e8 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -440,6 +440,9 @@ CONF_Bool(disable_mem_pools, "false"); // but will acquire more free memory which can not be used by other modules. CONF_mString(chunk_reserved_bytes_limit, "10%"); +// 1024, The minimum chunk allocator size (in bytes) +CONF_Int32(min_chunk_reserved_bytes, "1024"); + // Whether using chunk allocator to cache memory chunk CONF_Bool(disable_chunk_allocator, "true"); // Disable Chunk Allocator in Vectorized Allocator, this will reduce memory cache. diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index ca342f1f5e9911..68532cafef2bbe 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -185,8 +185,6 @@ Status ExecEnv::_init(const std::vector& store_paths) { } Status ExecEnv::_init_mem_tracker() { - LOG(INFO) << "Physical memory is: " - << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES); // 1. init global memory limit. int64_t global_memory_limit_bytes = 0; bool is_percent = false; @@ -308,6 +306,12 @@ Status ExecEnv::_init_mem_tracker() { RETURN_IF_ERROR(_tmp_file_mgr->init()); // 5. 
init chunk allocator + if (!BitUtil::IsPowerOf2(config::min_chunk_reserved_bytes)) { + ss << "Config min_chunk_reserved_bytes must be a power-of-two: " + << config::min_chunk_reserved_bytes; + return Status::InternalError(ss.str()); + } + int64_t chunk_reserved_bytes_limit = ParseUtil::parse_mem_spec(config::chunk_reserved_bytes_limit, global_memory_limit_bytes, MemInfo::physical_mem(), &is_percent); @@ -317,6 +321,8 @@ Status ExecEnv::_init_mem_tracker() { << config::chunk_reserved_bytes_limit; return Status::InternalError(ss.str()); } + chunk_reserved_bytes_limit = + BitUtil::RoundDown(chunk_reserved_bytes_limit, config::min_chunk_reserved_bytes); ChunkAllocator::init_instance(chunk_reserved_bytes_limit); LOG(INFO) << "Chunk allocator memory limit: " << PrettyPrinter::print(chunk_reserved_bytes_limit, TUnit::BYTES) diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp index f53dd4746c2a32..7e80e7e5b458ab 100644 --- a/be/src/runtime/mem_pool.cpp +++ b/be/src/runtime/mem_pool.cpp @@ -119,21 +119,9 @@ Status MemPool::find_chunk(size_t min_size, bool check_limits) { } // Didn't find a big enough free chunk - need to allocate new chunk. - size_t chunk_size = 0; DCHECK_LE(next_chunk_size_, MAX_CHUNK_SIZE); - - if (config::disable_mem_pools) { - // Disable pooling by sizing the chunk to fit only this allocation. - // Make sure the alignment guarantees are respected. - // This will generate too many small chunks. 
- chunk_size = std::max(min_size, alignof(max_align_t)); - } else { - DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE); - chunk_size = std::max(min_size, next_chunk_size_); - } - if (chunk_size < config::memory_linear_growth_threshold) { - chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size); - } + DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE); + size_t chunk_size = BitUtil::RoundUpToPowerOfTwo(std::max(min_size, next_chunk_size_)); if (check_limits && !thread_context()->_thread_mem_tracker_mgr->limiter_mem_tracker_raw()->check_limit( chunk_size)) { From f8ad6c0eaf1b96549b16587401da34427f5fa440 Mon Sep 17 00:00:00 2001 From: yiguolei Date: Sat, 15 Oct 2022 10:29:52 +0800 Subject: [PATCH 6/6] fix bugs --- be/src/common/config.h | 3 --- be/src/runtime/exec_env_init.cpp | 11 ++--------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index 4670e8a4b942e8..0b8ac4af9f0048 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -440,9 +440,6 @@ CONF_Bool(disable_mem_pools, "false"); // but will acquire more free memory which can not be used by other modules. CONF_mString(chunk_reserved_bytes_limit, "10%"); -// 1024, The minimum chunk allocator size (in bytes) -CONF_Int32(min_chunk_reserved_bytes, "1024"); - // Whether using chunk allocator to cache memory chunk CONF_Bool(disable_chunk_allocator, "true"); // Disable Chunk Allocator in Vectorized Allocator, this will reduce memory cache. diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 68532cafef2bbe..327a0307e75f9b 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -305,13 +305,6 @@ Status ExecEnv::_init_mem_tracker() { RETURN_IF_ERROR(_disk_io_mgr->init(global_memory_limit_bytes)); RETURN_IF_ERROR(_tmp_file_mgr->init()); - // 5. 
init chunk allocator - if (!BitUtil::IsPowerOf2(config::min_chunk_reserved_bytes)) { - ss << "Config min_chunk_reserved_bytes must be a power-of-two: " - << config::min_chunk_reserved_bytes; - return Status::InternalError(ss.str()); - } - int64_t chunk_reserved_bytes_limit = ParseUtil::parse_mem_spec(config::chunk_reserved_bytes_limit, global_memory_limit_bytes, MemInfo::physical_mem(), &is_percent); @@ -321,8 +314,8 @@ Status ExecEnv::_init_mem_tracker() { << config::chunk_reserved_bytes_limit; return Status::InternalError(ss.str()); } - chunk_reserved_bytes_limit = - BitUtil::RoundDown(chunk_reserved_bytes_limit, config::min_chunk_reserved_bytes); + // Has to round to a multiple of the page size (4096 bytes); the chunk allocator will also check this
+ chunk_reserved_bytes_limit = BitUtil::RoundDown(chunk_reserved_bytes_limit, 4096); ChunkAllocator::init_instance(chunk_reserved_bytes_limit); LOG(INFO) << "Chunk allocator memory limit: " << PrettyPrinter::print(chunk_reserved_bytes_limit, TUnit::BYTES)