From 1b030fa718fe684feed2749b293c2ef1156f190b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 3 Apr 2025 19:37:53 +0000 Subject: [PATCH 1/3] Merge pull request #78423 from azat/MADV_GUARD Better guard pages with MADV_GUARD_INSTALL/MADV_GUARD_REMOVE (linux 6.13+) --- programs/keeper/Keeper.cpp | 1 + src/Client/BuzzHouse/Generator/FuzzConfig.cpp | 1 + src/Client/ClientBase.cpp | 1 + src/Client/Connection.cpp | 1 + src/Common/AsyncTaskExecutor.cpp | 1 + src/Common/AsyncTaskExecutor.h | 3 + src/Common/FiberStack.cpp | 79 +++++++++++++++++++ src/Common/FiberStack.h | 76 ++---------------- src/Common/ThreadStatus.cpp | 28 +++++-- src/Common/memory.cpp | 62 +++++++++++++++ src/Common/memory.h | 22 ++++-- src/Coordination/FourLetterCommand.h | 4 +- .../DistributedAsyncInsertBatch.cpp | 1 + 13 files changed, 195 insertions(+), 85 deletions(-) create mode 100644 src/Common/FiberStack.cpp create mode 100644 src/Common/memory.cpp diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 76a97213d9cd..658a21ae8999 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -1,6 +1,7 @@ #include "Keeper.h" #include +#include #include #include #include diff --git a/src/Client/BuzzHouse/Generator/FuzzConfig.cpp b/src/Client/BuzzHouse/Generator/FuzzConfig.cpp index b2b159ccfd5c..2e0958dfbc1f 100644 --- a/src/Client/BuzzHouse/Generator/FuzzConfig.cpp +++ b/src/Client/BuzzHouse/Generator/FuzzConfig.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index f5c4bf43e472..5324bf871fa8 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 7f659e348a50..42c426992c31 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Common/AsyncTaskExecutor.cpp b/src/Common/AsyncTaskExecutor.cpp index 44781ab5b83b..c3c38efd1b79 100644 --- a/src/Common/AsyncTaskExecutor.cpp +++ b/src/Common/AsyncTaskExecutor.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace DB diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index 0225b415c166..a0dfd7f66274 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -1,5 +1,8 @@ #pragma once +#include +#include +#include #include #include #include diff --git a/src/Common/FiberStack.cpp b/src/Common/FiberStack.cpp new file mode 100644 index 000000000000..d9bf2593aa39 --- /dev/null +++ b/src/Common/FiberStack.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(BOOST_USE_VALGRIND) +#include +#endif + +/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + + +namespace DB::ErrorCodes +{ + extern const int CANNOT_ALLOCATE_MEMORY; +} + + +FiberStack::FiberStack(size_t stack_size_) + : stack_size(stack_size_) + , page_size(getPageSize()) +{ +} + +boost::context::stack_context FiberStack::allocate() const +{ + size_t num_pages = 1 + (stack_size - 1) / page_size; + size_t num_bytes = (num_pages + 1) * page_size; /// Add one page at bottom that will be used as guard-page + + void * vp = ::mmap(nullptr, num_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (MAP_FAILED == vp) + throw DB::ErrnoException(DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, "FiberStack: Cannot mmap {}.", ReadableSize(num_bytes)); + + /// TODO: make reports on illegal guard page access more clear. + /// Currently we will see segfault and almost random stacktrace. + try + { + memoryGuardInstall(vp, page_size); + } + catch (...) + { + ::munmap(vp, num_bytes); + throw; + } + + /// Do not count guard page in memory usage. + auto trace = CurrentMemoryTracker::alloc(num_pages * page_size); + trace.onAlloc(vp, num_pages * page_size); + + boost::context::stack_context sctx; + sctx.size = num_bytes; + sctx.sp = static_cast< char * >(vp) + sctx.size; +#if defined(BOOST_USE_VALGRIND) + sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, vp); +#endif + return sctx; +} + +void FiberStack::deallocate(boost::context::stack_context & sctx) const +{ +#if defined(BOOST_USE_VALGRIND) + VALGRIND_STACK_DEREGISTER(sctx.valgrind_stack_id); +#endif + void * vp = static_cast< char * >(sctx.sp) - sctx.size; + ::munmap(vp, sctx.size); + + /// Do not count guard page in memory usage. + auto trace = CurrentMemoryTracker::free(sctx.size - page_size); + trace.onFree(vp, sctx.size - page_size); +} diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h index 9d135f27306f..acc4eb8d0a97 100644 --- a/src/Common/FiberStack.h +++ b/src/Common/FiberStack.h @@ -1,36 +1,11 @@ #pragma once -#include #include -#include -#include -#include -#include -#include -#include -#include - -#if defined(BOOST_USE_VALGRIND) -#include -#endif - -/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS -#ifndef MAP_ANONYMOUS -#define MAP_ANONYMOUS MAP_ANON -#endif - -namespace DB::ErrorCodes -{ - extern const int CANNOT_ALLOCATE_MEMORY; -} /// This is an implementation of allocator for fiber stack. /// The reference implementation is protected_fixedsize_stack from boost::context. /// This implementation additionally track memory usage. It is the main reason why it is needed. class FiberStack { -private: - size_t stack_size; - size_t page_size = 0; public: /// NOTE: If you see random segfaults in CI and stack starts from boost::context::...fiber... /// probably it worth to try to increase stack size for coroutines. @@ -39,51 +14,12 @@ class FiberStack /// way. We will have 80 pages with 4KB page size. static constexpr size_t default_stack_size = 320 * 1024; /// 64KB was not enough for tests - explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) - { - page_size = getPageSize(); - } - - boost::context::stack_context allocate() const - { - size_t num_pages = 1 + (stack_size - 1) / page_size; - size_t num_bytes = (num_pages + 1) * page_size; /// Add one page at bottom that will be used as guard-page + explicit FiberStack(size_t stack_size_ = default_stack_size); - void * vp = ::mmap(nullptr, num_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (MAP_FAILED == vp) - throw DB::ErrnoException(DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, "FiberStack: Cannot mmap {}.", ReadableSize(num_bytes)); + boost::context::stack_context allocate() const; + void deallocate(boost::context::stack_context & sctx) const; - /// TODO: make reports on illegal guard page access more clear. - /// Currently we will see segfault and almost random stacktrace. - if (-1 == ::mprotect(vp, page_size, PROT_NONE)) - { - ::munmap(vp, num_bytes); - throw DB::ErrnoException(DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, "FiberStack: cannot protect guard page"); - } - - /// Do not count guard page in memory usage. - auto trace = CurrentMemoryTracker::alloc(num_pages * page_size); - trace.onAlloc(vp, num_pages * page_size); - - boost::context::stack_context sctx; - sctx.size = num_bytes; - sctx.sp = static_cast< char * >(vp) + sctx.size; -#if defined(BOOST_USE_VALGRIND) - sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, vp); -#endif - return sctx; - } - - void deallocate(boost::context::stack_context & sctx) const - { -#if defined(BOOST_USE_VALGRIND) - VALGRIND_STACK_DEREGISTER(sctx.valgrind_stack_id); -#endif - void * vp = static_cast< char * >(sctx.sp) - sctx.size; - ::munmap(vp, sctx.size); - - /// Do not count guard page in memory usage. - auto trace = CurrentMemoryTracker::free(sctx.size - page_size); - trace.onFree(vp, sctx.size - page_size); - } +private: + const size_t stack_size; + const size_t page_size; }; diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 47912a7922a4..7850b771492a 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,11 @@ namespace DB { thread_local ThreadStatus constinit * current_thread = nullptr; +namespace ErrorCodes +{ + extern const int CANNOT_ALLOCATE_MEMORY; +} + #if !defined(SANITIZER) namespace { @@ -41,15 +47,25 @@ constexpr size_t UNWIND_MINSIGSTKSZ = 32 << 10; struct ThreadStack { ThreadStack() - : data(aligned_alloc(getPageSize(), getSize())) { - /// Add a guard page - /// (and since the stack grows downward, we need to protect the first page). - mprotect(data, getPageSize(), PROT_NONE); + data = aligned_alloc(getPageSize(), getSize()); + if (!data) + throw ErrnoException(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Cannot allocate ThreadStack"); + + try + { + /// Since the stack grows downward, we need to protect the first page + memoryGuardInstall(data, getPageSize()); + } + catch (...) + { + free(data); + throw; + } } ~ThreadStack() { - mprotect(data, getPageSize(), PROT_WRITE|PROT_READ); + memoryGuardRemove(data, getPageSize()); free(data); } @@ -58,7 +74,7 @@ struct ThreadStack private: /// 16 KiB - not too big but enough to handle error. - void * data; + void * data = nullptr; }; } diff --git a/src/Common/memory.cpp b/src/Common/memory.cpp new file mode 100644 index 000000000000..332d2812041c --- /dev/null +++ b/src/Common/memory.cpp @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include + +#ifdef OS_LINUX +#if !defined(MADV_GUARD_INSTALL) +#define MADV_GUARD_INSTALL 102 +#endif + +#if !defined(MADV_GUARD_REMOVE) +#define MADV_GUARD_REMOVE 103 +#endif + +static bool supportsGuardPages() +{ + DB::VersionNumber madv_guard_minimal_version(6, 13, 0); + DB::VersionNumber linux_version(Poco::Environment::osVersion()); + return (linux_version >= madv_guard_minimal_version); +} +static bool supports_guard_pages = supportsGuardPages(); +#endif // OS_LINUX + +namespace DB::ErrorCodes +{ + extern const int SYSTEM_ERROR; +} + +/// Uses MADV_GUARD_INSTALL if available, or mprotect() if not +void memoryGuardInstall(void *addr, size_t len) +{ +#ifdef OS_LINUX + if (supports_guard_pages) + { + if (madvise(addr, len, MADV_GUARD_INSTALL)) + throw DB::ErrnoException(DB::ErrorCodes::SYSTEM_ERROR, "Cannot madvise(MADV_GUARD_INSTALL)"); + } + else +#endif + { + if (mprotect(addr, len, PROT_NONE)) + throw DB::ErrnoException(DB::ErrorCodes::SYSTEM_ERROR, "Cannot mprotect(PROT_NONE)"); + } +} + +/// Uses MADV_GUARD_REMOVE if available, or mprotect() if not +void memoryGuardRemove(void *addr, size_t len) +{ +#ifdef OS_LINUX + if (supports_guard_pages) + { + if (madvise(addr, len, MADV_GUARD_REMOVE)) + throw DB::ErrnoException(DB::ErrorCodes::SYSTEM_ERROR, "Cannot madvise(MADV_GUARD_INSTALL)"); + } + else +#endif + { + if (mprotect(addr, len, PROT_READ|PROT_WRITE)) + throw DB::ErrnoException(DB::ErrorCodes::SYSTEM_ERROR, "Cannot mprotect(PROT_NONE)"); + } +} diff --git a/src/Common/memory.h b/src/Common/memory.h index 11b8e7dfd3ad..3fd2c9444cca 100644 --- a/src/Common/memory.h +++ b/src/Common/memory.h @@ -17,6 +17,12 @@ # include #endif +#if defined(OS_LINUX) +# include +#elif defined(OS_DARWIN) +# include +#endif + namespace ProfileEvents { extern const Event GWPAsanAllocateSuccess; @@ -24,6 +30,16 @@ namespace ProfileEvents extern const Event GWPAsanFree; } +/// Guard pages interface. +/// +/// Uses MADV_GUARD_INSTALL/MADV_GUARD_REMOVE (since Linux 6.13+) which does +/// not splits VMA (unlike mprotect()), or fallback to mprotect() +/// +/// Uses MADV_GUARD_INSTALL if available, or mprotect() if not +void memoryGuardInstall(void *addr, size_t len); +/// Uses MADV_GUARD_REMOVE if available, or mprotect() if not +void memoryGuardRemove(void *addr, size_t len); + namespace Memory { @@ -165,12 +181,6 @@ inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unuse #endif -#if defined(OS_LINUX) -# include -#elif defined(OS_DARWIN) -# include -#endif - template ... TAlign> requires DB::OptionalArgument inline ALWAYS_INLINE size_t getActualAllocationSize(size_t size, TAlign... align [[maybe_unused]]) diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index 32db8a724c6e..28977a284fa1 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -5,8 +5,8 @@ #include #include #include -#include #include +#include #include namespace DB @@ -14,8 +14,6 @@ namespace DB class KeeperDispatcher; -using String = std::string; - struct IFourLetterCommand; using FourLetterCommandPtr = std::shared_ptr; diff --git a/src/Storages/Distributed/DistributedAsyncInsertBatch.cpp b/src/Storages/Distributed/DistributedAsyncInsertBatch.cpp index 14e503f586ab..4996398ef598 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertBatch.cpp +++ b/src/Storages/Distributed/DistributedAsyncInsertBatch.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include From 06a37c530a977f872b336300bd0bdb21e1086251 Mon Sep 17 00:00:00 2001 From: Sema Checherinda <104093494+CheSema@users.noreply.github.com> Date: Thu, 17 Apr 2025 18:40:29 +0000 Subject: [PATCH 2/3] Merge pull request #79147 from ClickHouse/chesema-reduce-stack-protection stack allocation in Fiber and alternative stack for signals --- src/Common/FiberStack.cpp | 67 +++++++++++++++++++++++++------------ src/Common/ThreadStatus.cpp | 45 +++++++++++++++++++------ 2 files changed, 79 insertions(+), 33 deletions(-) diff --git a/src/Common/FiberStack.cpp b/src/Common/FiberStack.cpp index d9bf2593aa39..6fc5c0df4e8e 100644 --- a/src/Common/FiberStack.cpp +++ b/src/Common/FiberStack.cpp @@ -18,6 +18,17 @@ #define MAP_ANONYMOUS MAP_ANON #endif +namespace +{ +constexpr bool guardPagesEnabled() +{ +#ifdef DEBUG_OR_SANITIZER_BUILD + return true; +#else + return false; +#endif +} +} namespace DB::ErrorCodes { @@ -34,33 +45,41 @@ FiberStack::FiberStack(size_t stack_size_) boost::context::stack_context FiberStack::allocate() const { size_t num_pages = 1 + (stack_size - 1) / page_size; - size_t num_bytes = (num_pages + 1) * page_size; /// Add one page at bottom that will be used as guard-page - void * vp = ::mmap(nullptr, num_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (MAP_FAILED == vp) - throw DB::ErrnoException(DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, "FiberStack: Cannot mmap {}.", ReadableSize(num_bytes)); + if constexpr (guardPagesEnabled()) + /// Add one page at bottom that will be used as guard-page + num_pages += 1; - /// TODO: make reports on illegal guard page access more clear. - /// Currently we will see segfault and almost random stacktrace. - try - { - memoryGuardInstall(vp, page_size); - } - catch (...) + size_t num_bytes = num_pages * page_size; + + void * data = aligned_alloc(page_size, num_bytes); + + if (!data) + throw DB::ErrnoException(DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Cannot allocate FiberStack"); + + if constexpr (guardPagesEnabled()) { - ::munmap(vp, num_bytes); - throw; + /// TODO: make reports on illegal guard page access more clear. + /// Currently we will see segfault and almost random stacktrace. + try + { + memoryGuardInstall(data, page_size); + } + catch (...) + { + free(data); + throw; + } } - /// Do not count guard page in memory usage. - auto trace = CurrentMemoryTracker::alloc(num_pages * page_size); - trace.onAlloc(vp, num_pages * page_size); + auto trace = CurrentMemoryTracker::alloc(num_bytes); + trace.onAlloc(data, num_bytes); boost::context::stack_context sctx; sctx.size = num_bytes; - sctx.sp = static_cast< char * >(vp) + sctx.size; + sctx.sp = static_cast< char * >(data) + sctx.size; #if defined(BOOST_USE_VALGRIND) - sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, vp); + sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, data); #endif return sctx; } @@ -70,10 +89,14 @@ void FiberStack::deallocate(boost::context::stack_context & sctx) const #if defined(BOOST_USE_VALGRIND) VALGRIND_STACK_DEREGISTER(sctx.valgrind_stack_id); #endif - void * vp = static_cast< char * >(sctx.sp) - sctx.size; - ::munmap(vp, sctx.size); + void * data = static_cast< char * >(sctx.sp) - sctx.size; + + if constexpr (guardPagesEnabled()) + memoryGuardRemove(data, page_size); + + free(data); /// Do not count guard page in memory usage. - auto trace = CurrentMemoryTracker::free(sctx.size - page_size); - trace.onFree(vp, sctx.size - page_size); + auto trace = CurrentMemoryTracker::free(sctx.size); + trace.onFree(data, sctx.size - page_size); } diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 7850b771492a..c77766de5968 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -29,6 +29,15 @@ namespace ErrorCodes namespace { +constexpr bool guardPagesEnabled() +{ +#ifdef DEBUG_OR_SANITIZER_BUILD + return true; +#else + return false; +#endif +} + /// For aarch64 16K is not enough (likely due to tons of registers) constexpr size_t UNWIND_MINSIGSTKSZ = 32 << 10; @@ -48,28 +57,42 @@ struct ThreadStack { ThreadStack() { - data = aligned_alloc(getPageSize(), getSize()); + auto page_size = getPageSize(); + data = aligned_alloc(page_size, getSize()); if (!data) throw ErrnoException(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Cannot allocate ThreadStack"); - try - { - /// Since the stack grows downward, we need to protect the first page - memoryGuardInstall(data, getPageSize()); - } - catch (...) + if constexpr (guardPagesEnabled()) { - free(data); - throw; + try + { + /// Since the stack grows downward, we need to protect the first page + memoryGuardInstall(data, page_size); + } + catch (...) + { + free(data); + throw; + } } } ~ThreadStack() { - memoryGuardRemove(data, getPageSize()); + if constexpr (guardPagesEnabled()) + memoryGuardRemove(data, getPageSize()); + free(data); } - static size_t getSize() { return std::max(UNWIND_MINSIGSTKSZ, MINSIGSTKSZ); } + static size_t getSize() + { + auto size = std::max(UNWIND_MINSIGSTKSZ, MINSIGSTKSZ); + + if constexpr (guardPagesEnabled()) + size += 1; + + return size; + } void * getData() const { return data; } private: From 91639256c4817443001a70c7c5792e94f26f556d Mon Sep 17 00:00:00 2001 From: Sema Checherinda <104093494+CheSema@users.noreply.github.com> Date: Fri, 25 Apr 2025 10:59:50 +0000 Subject: [PATCH 3/3] Merge pull request #79533 from ClickHouse/chesema-fix-alloc-alt-thread fix ThreadStack::getSize --- src/Common/ThreadStatus.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index c77766de5968..e2b82295dab6 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -89,7 +89,7 @@ struct ThreadStack auto size = std::max(UNWIND_MINSIGSTKSZ, MINSIGSTKSZ); if constexpr (guardPagesEnabled()) - size += 1; + size += getPageSize(); return size; }