14 changes: 10 additions & 4 deletions be/src/common/config.h
@@ -239,7 +239,7 @@ CONF_Int32(storage_page_cache_shard_size, "16");
// all storage page cache will be divided into data_page_cache and index_page_cache
CONF_Int32(index_page_cache_percentage, "10");
// whether to disable page cache feature in storage
CONF_Bool(disable_storage_page_cache, "false");
CONF_Bool(disable_storage_page_cache, "true");

CONF_Bool(enable_storage_vectorization, "true");

@@ -439,14 +439,20 @@ CONF_Bool(disable_mem_pools, "false");
// increase this variable can improve performance,
// but will acquire more free memory which can not be used by other modules.
CONF_mString(chunk_reserved_bytes_limit, "10%");
// 1024, The minimum chunk allocator size (in bytes)
CONF_Int32(min_chunk_reserved_bytes, "1024");

// Whether using chunk allocator to cache memory chunk
CONF_Bool(disable_chunk_allocator, "true");

Contributor:
disable_chunk_allocator_in_mem_pool


Contributor Author:
No, I will remove MemPool after we remove the non-vectorized engine. MemPool is implemented in MemPool.cpp; it works like an arena. The config disable_mem_pools is also very confusing. I will remove them.

// Disable Chunk Allocator in Vectorized Allocator, this will reduce memory cache.
// For high concurrent queries, using Chunk Allocator with vectorized Allocator can reduce the impact
// of gperftools tcmalloc central lock.
// Jemalloc or google tcmalloc have core cache, Chunk Allocator may no longer be needed after replacing
// gperftools tcmalloc.
CONF_mBool(disable_chunk_allocator_in_vec, "false");
CONF_mBool(disable_chunk_allocator_in_vec, "true");

// Both MemPool and vectorized engine's podarray allocator, vectorized engine's arena will try to allocate memory as power of two.
// But if the memory is very large then power of two is also very large. This config means if the allocated memory's size is larger
// than this limit then all allocators will not use RoundUpToPowerOfTwo to allocate memory.
CONF_mInt64(memory_linear_growth_threshold, "134217728"); // 128Mb
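
As an aside for reviewers: a minimal sketch (not Doris code; the helper names are assumed, only the 128 MB default comes from the entry above) of the growth policy memory_linear_growth_threshold controls: requests below the threshold are rounded up to a power of two, while larger ones grow linearly, so a 129 MB request does not balloon to 256 MB.

```cpp
#include <cstdint>
#include <iostream>

// Illustrative stand-in for BitUtil::RoundUpToPowerOfTwo.
static int64_t round_up_to_power_of_two(int64_t v) {
    int64_t r = 1;
    while (r < v) r <<= 1;
    return r;
}

// Assumed policy: round up only while below the linear growth threshold.
static int64_t pick_allocation_size(int64_t requested, int64_t linear_growth_threshold) {
    if (requested < linear_growth_threshold) {
        return round_up_to_power_of_two(requested);
    }
    return requested; // large allocations are kept as requested
}

int main() {
    const int64_t threshold = 134217728; // 128 MB, the default above
    std::cout << pick_allocation_size(100, threshold) << "\n";       // 128
    std::cout << pick_allocation_size(135000000, threshold) << "\n"; // 135000000, not rounded
    return 0;
}
```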

// The probing algorithm of partitioned hash table.
// Enable quadratic probing hash table
15 changes: 3 additions & 12 deletions be/src/runtime/exec_env_init.cpp
@@ -199,9 +199,7 @@ Status ExecEnv::_init_mem_tracker() {
if (global_memory_limit_bytes > MemInfo::physical_mem()) {
LOG(WARNING) << "Memory limit "
<< PrettyPrinter::print(global_memory_limit_bytes, TUnit::BYTES)
<< " exceeds physical memory of "
<< PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES)
<< ". Using physical memory instead";
<< " exceeds physical memory, using physical memory instead";
global_memory_limit_bytes = MemInfo::physical_mem();
}
_process_mem_tracker =
@@ -307,13 +305,6 @@ Status ExecEnv::_init_mem_tracker() {
RETURN_IF_ERROR(_disk_io_mgr->init(global_memory_limit_bytes));
RETURN_IF_ERROR(_tmp_file_mgr->init());

// 5. init chunk allocator
if (!BitUtil::IsPowerOf2(config::min_chunk_reserved_bytes)) {
ss << "Config min_chunk_reserved_bytes must be a power-of-two: "
<< config::min_chunk_reserved_bytes;
return Status::InternalError(ss.str());
}

int64_t chunk_reserved_bytes_limit =
ParseUtil::parse_mem_spec(config::chunk_reserved_bytes_limit, global_memory_limit_bytes,
MemInfo::physical_mem(), &is_percent);
@@ -323,8 +314,8 @@ Status ExecEnv::_init_mem_tracker() {
<< config::chunk_reserved_bytes_limit;
return Status::InternalError(ss.str());
}
chunk_reserved_bytes_limit =
BitUtil::RoundDown(chunk_reserved_bytes_limit, config::min_chunk_reserved_bytes);

Contributor:
The BitUtil::RoundDown(chunk_reserved_bytes_limit, 4096) here ensures that chunk_reserved_bytes_limit is a multiple of 4096.

4096 is the minimum chunk size currently allocated by the chunk allocator.

A separate conf min_chunk_reserved_bytes is not necessary, but the RoundDown is meaningful.


Contributor Author:
Yes... I will bring min_chunk_reserved_bytes back.


Contributor:
We can remove min_chunk_reserved_bytes; a const 4096 is fine, the user will not modify it.


Contributor Author:
OK, I will remove it.

// Has to round to multiple of page size(4096 bytes), chunk allocator will also check this
chunk_reserved_bytes_limit = BitUtil::RoundDown(chunk_reserved_bytes_limit, 4096);
ChunkAllocator::init_instance(chunk_reserved_bytes_limit);
LOG(INFO) << "Chunk allocator memory limit: "
<< PrettyPrinter::print(chunk_reserved_bytes_limit, TUnit::BYTES)
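
As a note on the BitUtil::RoundDown(chunk_reserved_bytes_limit, 4096) call above, here is a standalone sketch (not the Doris helper itself) of what rounding down to a multiple of the 4096-byte page size does:

```cpp
#include <cassert>
#include <cstdint>

// Rounds a byte count down to a multiple of the 4096-byte page size.
static int64_t round_down_to_page_size(int64_t value) {
    return (value / 4096) * 4096;
}

int main() {
    assert(round_down_to_page_size(10000) == 8192); // two full pages
    assert(round_down_to_page_size(4096) == 4096);  // already aligned, unchanged
    assert(round_down_to_page_size(4095) == 0);     // less than one page rounds to zero
    return 0;
}
```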
15 changes: 2 additions & 13 deletions be/src/runtime/mem_pool.cpp
@@ -119,20 +119,9 @@ Status MemPool::find_chunk(size_t min_size, bool check_limits) {
}

// Didn't find a big enough free chunk - need to allocate new chunk.
size_t chunk_size = 0;
DCHECK_LE(next_chunk_size_, MAX_CHUNK_SIZE);

if (config::disable_mem_pools) {
// Disable pooling by sizing the chunk to fit only this allocation.
// Make sure the alignment guarantees are respected.
// This will generate too many small chunks.
chunk_size = std::max<size_t>(min_size, alignof(max_align_t));
} else {
DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE);
chunk_size = std::max<size_t>(min_size, next_chunk_size_);
}

chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size);
DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE);
size_t chunk_size = BitUtil::RoundUpToPowerOfTwo(std::max<size_t>(min_size, next_chunk_size_));
if (check_limits &&
!thread_context()->_thread_mem_tracker_mgr->limiter_mem_tracker_raw()->check_limit(
chunk_size)) {
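
For readers following the new sizing expression in find_chunk(), a small sketch (assumed constants, not the Doris implementation): the requested size is clamped up to the pool's next chunk size and then rounded to a power of two.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

// Illustrative stand-in for BitUtil::RoundUpToPowerOfTwo.
static size_t round_up_to_power_of_two(size_t v) {
    size_t r = 1;
    while (r < v) r <<= 1;
    return r;
}

int main() {
    const size_t next_chunk_size = 4096; // assumed value of next_chunk_size_
    // Small request: the pool's next chunk size wins.
    assert(round_up_to_power_of_two(std::max<size_t>(1000, next_chunk_size)) == 4096);
    // Large request: rounded up to the next power of two.
    assert(round_up_to_power_of_two(std::max<size_t>(5000, next_chunk_size)) == 8192);
    return 0;
}
```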
6 changes: 3 additions & 3 deletions be/src/runtime/mem_pool.h
@@ -231,9 +231,9 @@ class MemPool {
// I refers to https://github.com/mcgov/asan_alignment_example.

ChunkInfo& info = chunks_[current_chunk_idx_];
int64_t aligned_allocated_bytes =
BitUtil::RoundUpToPowerOf2(info.allocated_bytes + DEFAULT_PADDING_SIZE, alignment);
if (aligned_allocated_bytes + size <= info.chunk.size) {
int64_t aligned_allocated_bytes = BitUtil::RoundUpToMultiplyOfFactor(
info.allocated_bytes + DEFAULT_PADDING_SIZE, alignment);
if (aligned_allocated_bytes + size + DEFAULT_PADDING_SIZE <= info.chunk.size) {
// Ensure the requested alignment is respected.
int64_t padding = aligned_allocated_bytes - info.allocated_bytes;
uint8_t* result = info.chunk.data + aligned_allocated_bytes;
57 changes: 29 additions & 28 deletions be/src/runtime/memory/chunk_allocator.cpp
@@ -154,35 +154,36 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit)
Status ChunkAllocator::allocate(size_t size, Chunk* chunk) {
CHECK((size > 0 && (size & (size - 1)) == 0));

// fast path: allocate from current core arena
int core_id = CpuInfo::get_current_core();
chunk->size = size;
chunk->core_id = core_id;

if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) {
DCHECK_GE(_reserved_bytes, 0);
_reserved_bytes.fetch_sub(size);
chunk_pool_local_core_alloc_count->increment(1);
// transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
return Status::OK();
}
// Second path: try to allocate from other core's arena
// When the reserved bytes is greater than the limit, the chunk is stolen from other arena.
// Otherwise, it is allocated from the system first, which can reserve enough memory as soon as possible.
// After that, allocate from current core arena as much as possible.
if (_reserved_bytes > _steal_arena_limit) {
++core_id;
for (int i = 1; i < _arenas.size(); ++i, ++core_id) {
if (_arenas[core_id % _arenas.size()]->pop_free_chunk(size, &chunk->data)) {
DCHECK_GE(_reserved_bytes, 0);
_reserved_bytes.fetch_sub(size);
chunk_pool_other_core_alloc_count->increment(1);
// reset chunk's core_id to other
chunk->core_id = core_id % _arenas.size();
// transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
return Status::OK();
chunk->size = size;
if (!config::disable_chunk_allocator) {
// fast path: allocate from current core arena
if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) {
DCHECK_GE(_reserved_bytes, 0);
_reserved_bytes.fetch_sub(size);
chunk_pool_local_core_alloc_count->increment(1);
// transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
return Status::OK();
}
// Second path: try to allocate from other core's arena
// When the reserved bytes is greater than the limit, the chunk is stolen from other arena.
// Otherwise, it is allocated from the system first, which can reserve enough memory as soon as possible.
// After that, allocate from current core arena as much as possible.
if (_reserved_bytes > _steal_arena_limit) {
++core_id;
for (int i = 1; i < _arenas.size(); ++i, ++core_id) {
if (_arenas[core_id % _arenas.size()]->pop_free_chunk(size, &chunk->data)) {
DCHECK_GE(_reserved_bytes, 0);
_reserved_bytes.fetch_sub(size);
chunk_pool_other_core_alloc_count->increment(1);
// reset chunk's core_id to other
chunk->core_id = core_id % _arenas.size();
// transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
return Status::OK();
}
}
}
}
@@ -204,7 +205,7 @@ Status ChunkAllocator::allocate(size_t size, Chunk* chunk) {
void ChunkAllocator::free(const Chunk& chunk) {
DCHECK(chunk.core_id != -1);
CHECK((chunk.size & (chunk.size - 1)) == 0);
if (config::disable_mem_pools) {
if (config::disable_chunk_allocator) {

Contributor (@xinyiZzz, Oct 13, 2022):
Is it better to move the condition into MemPool and rename it disable_chunk_allocator_in_mem_pool, similar to disable_chunk_allocator_in_vec?


Contributor Author:
Let's not use that. I will remove disable_mem_pools in the future, after the non-vectorized engine is removed; it is easily confused with MemPool.


Contributor:
Try removing disable_chunk_allocator_in_vec and replacing it with disable_chunk_allocator (with more detailed config comments).

SystemAllocator::free(chunk.data, chunk.size);
return;
}
10 changes: 8 additions & 2 deletions be/src/util/bit_util.h
@@ -43,6 +43,8 @@ class BitUtil {
return value / divisor + (value % divisor != 0);
}

static inline size_t round_up_to_page_size(size_t s) { return (s + 4096 - 1) / 4096 * 4096; }

// Returns 'value' rounded up to the nearest multiple of 'factor'
static inline int64_t round_up(int64_t value, int64_t factor) {
return (value + (factor - 1)) / factor * factor;
@@ -304,8 +306,12 @@
}

/// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
/// a power of two
static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
/// a power of two, for example
/// Factor has to be a power of two
/// factor = 16, value = 10 --> result = 16
/// factor = 16, value = 17 --> result = 32
/// factor = 16, value = 33 --> result = 48
static inline int64_t RoundUpToMultiplyOfFactor(int64_t value, int64_t factor) {
DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
return (value + (factor - 1)) & ~(factor - 1);
}
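
A quick standalone demonstration (sketch only, not the Doris header) of the renamed helper's documented behaviour with factor = 16:

```cpp
#include <cassert>
#include <cstdint>

// Same bit trick as RoundUpToMultiplyOfFactor; factor must be a power of two.
static int64_t round_up_to_multiply_of_factor(int64_t value, int64_t factor) {
    return (value + (factor - 1)) & ~(factor - 1);
}

int main() {
    assert(round_up_to_multiply_of_factor(10, 16) == 16);
    assert(round_up_to_multiply_of_factor(17, 16) == 32);
    assert(round_up_to_multiply_of_factor(33, 16) == 48);
    assert(round_up_to_multiply_of_factor(64, 16) == 64); // already a multiple, unchanged
    return 0;
}
```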
11 changes: 8 additions & 3 deletions be/src/vec/common/arena.h
@@ -127,11 +127,16 @@ class Arena : private boost::noncopyable {

public:
Arena(size_t initial_size_ = 4096, size_t growth_factor_ = 2,
size_t linear_growth_threshold_ = 128 * 1024 * 1024)
size_t linear_growth_threshold_ = -1)
: growth_factor(growth_factor_),
linear_growth_threshold(linear_growth_threshold_),
head(new Chunk(initial_size_, nullptr)),
size_in_bytes(head->size()) {}
size_in_bytes(head->size()) {
if (linear_growth_threshold_ < 0) {
linear_growth_threshold = config::memory_linear_growth_threshold;
} else {
linear_growth_threshold = linear_growth_threshold_;
}
}
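
A minimal sketch of the defaulting pattern used in this constructor, with assumed names and a signed sentinel type for clarity: a negative threshold argument means "fall back to config::memory_linear_growth_threshold".

```cpp
#include <cstdint>

namespace config {
// Mirrors the 128 MB default added in config.h above.
inline int64_t memory_linear_growth_threshold = 134217728;
} // namespace config

struct ArenaSketch {
    int64_t linear_growth_threshold;

    explicit ArenaSketch(int64_t linear_growth_threshold_ = -1)
            : linear_growth_threshold(linear_growth_threshold_ < 0
                                              ? config::memory_linear_growth_threshold
                                              : linear_growth_threshold_) {}
};

int main() {
    ArenaSketch defaulted;               // picks up the global config value (128 MB)
    ArenaSketch explicit_arena(1 << 20); // an explicitly passed threshold wins
    return (defaulted.linear_growth_threshold == 134217728 &&
            explicit_arena.linear_growth_threshold == (1 << 20))
                   ? 0
                   : 1;
}
```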

~Arena() { delete head; }

22 changes: 16 additions & 6 deletions be/src/vec/common/pod_array.h
@@ -30,6 +30,8 @@
#include <cstddef>
#include <memory>

#include "common/config.h"
#include "util/bit_util.h"
#include "vec/common/allocator.h"
#include "vec/common/bit_helpers.h"
#include "vec/common/memcpy_small.h"
@@ -120,8 +122,9 @@ class PODArrayBase : private boost::noncopyable,
}
}

/// Not round up, keep the size just as the application pass in like std::vector
void alloc_for_num_elements(size_t num_elements) {
alloc(round_up_to_power_of_two_or_zero(minimum_memory_for_elements(num_elements)));
alloc(minimum_memory_for_elements(num_elements));

Contributor:
Allocating in powers of 2 has a positive impact on performance; if you wish to reduce memory usage, wrap this in #ifndef STRICT_MEMORY_USE, similar to the hash_table.h expansion.


Contributor Author:
I do not think this should be a config, because if it were, we would not know when to turn it on, since it is a macro. Actually, there are two types of memory allocation in PODArray:

  1. reserve: sometimes the developer knows the expected size of the array, and should call the reserve method to allocate the EXPECTED amount of memory.
  2. push_back: the developer does not know the expected size of the array and just calls push_back to allocate memory. In this scenario, we should allocate memory using powers of 2.

In most cases we should reserve or resize before push_back, so that we can reduce memory reallocation and memory copying.
This PR tries to fix some of these problems: #13088.
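
A hedged sketch of the two growth behaviours described above (illustrative only; the 16-byte element size is assumed): reserve allocates exactly what the caller asks for, while push_back-style growth still expands by powers of two.

```cpp
#include <cstdint>
#include <iostream>

// Illustrative stand-in for round_up_to_power_of_two_or_zero.
static uint64_t round_up_to_power_of_two(uint64_t v) {
    uint64_t r = 1;
    while (r < v) r <<= 1;
    return r;
}

int main() {
    constexpr uint64_t element_size = 16; // assumed
    const uint64_t n = 1000;

    // reserve(n): the caller knows the expected size, so allocate exactly n * element_size.
    std::cout << "reserve:      " << n * element_size << " bytes\n"; // 16000

    // Power-of-two growth (the old reserve behaviour, still used for push_back expansion):
    // trades extra memory for fewer reallocations.
    std::cout << "power of two: " << round_up_to_power_of_two(n * element_size) << " bytes\n"; // 16384
    return 0;
}
```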

}

template <typename... TAllocatorParams>
@@ -181,6 +184,7 @@ class PODArrayBase : private boost::noncopyable,
return (stack_threshold > 0) && (allocated_bytes() <= stack_threshold);
}

/// This method is called by push back or emplace back, this is the same behaviour with std::vector
template <typename... TAllocatorParams>
void reserve_for_next_size(TAllocatorParams&&... allocator_params) {
if (size() == 0) {
@@ -189,8 +193,10 @@
realloc(std::max(integerRoundUp(initial_bytes, ELEMENT_SIZE),
minimum_memory_for_elements(1)),
std::forward<TAllocatorParams>(allocator_params)...);
} else
} else {
// There is still a power of 2 expansion here, this method is used in push back method
realloc(allocated_bytes() * 2, std::forward<TAllocatorParams>(allocator_params)...);
}
}

#ifndef NDEBUG
@@ -229,7 +235,7 @@ class PODArrayBase : private boost::noncopyable,
template <typename... TAllocatorParams>
void reserve(size_t n, TAllocatorParams&&... allocator_params) {
if (n > capacity())
realloc(round_up_to_power_of_two_or_zero(minimum_memory_for_elements(n)),
realloc(minimum_memory_for_elements(n),
std::forward<TAllocatorParams>(allocator_params)...);
}

@@ -444,9 +450,11 @@ class PODArray : public PODArrayBase<sizeof(T), initial_bytes, TAllocator, pad_r
template <typename It1, typename It2, typename... TAllocatorParams>
void insert_prepare(It1 from_begin, It2 from_end, TAllocatorParams&&... allocator_params) {
size_t required_capacity = this->size() + (from_end - from_begin);
if (required_capacity > this->capacity())
if (required_capacity > this->capacity()) {
// std::vector's insert method will expand if the required capacity is larger than the current one
this->reserve(round_up_to_power_of_two_or_zero(required_capacity),
std::forward<TAllocatorParams>(allocator_params)...);
}
}

/// Do not insert into the array a piece of itself. Because with the resize, the iterators on themselves can be invalidated.
@@ -623,8 +631,10 @@ class PODArray : public PODArrayBase<sizeof(T), initial_bytes, TAllocator, pad_r
template <typename It1, typename It2>
void assign(It1 from_begin, It2 from_end) {
size_t required_capacity = from_end - from_begin;
if (required_capacity > this->capacity())
this->reserve(round_up_to_power_of_two_or_zero(required_capacity));
if (required_capacity > this->capacity()) {
// std::vector assign just expand the capacity to the required capacity
this->reserve(required_capacity);
}

size_t bytes_to_copy = this->byte_size(required_capacity);
memcpy(this->c_start, reinterpret_cast<const void*>(&*from_begin), bytes_to_copy);