From 938d3e870ef14bc24ede1abc73cb3bc38209d799 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 14:16:00 +0200 Subject: [PATCH 01/15] Update CHANGELOG for v2.0.0 with breaking changes and new features - Introduced breaking changes including type aliases for `ThreadPool` and `FastThreadPool`, updated return types for thread configuration methods, and removal of `submit_range()`. - Added new types such as `ThreadPoolBase` and `GlobalPool`. - Centralized OS-level logic for thread priority, scheduling, and affinity into detail functions, reducing code duplication by ~1000 lines. - Refactored `apply_profile()` methods for better consistency and clarity across thread types. - Updated `VERSION` to 2.0.0. --- CHANGELOG.md | 82 ++ VERSION | 2 +- include/threadschedule/profiles.hpp | 116 ++- include/threadschedule/pthread_wrapper.hpp | 28 +- include/threadschedule/scheduled_pool.hpp | 62 +- include/threadschedule/scheduler_policy.hpp | 141 ++++ include/threadschedule/thread_pool.hpp | 718 +++++------------- .../thread_pool_with_errors.hpp | 329 +------- include/threadschedule/thread_registry.hpp | 67 +- include/threadschedule/thread_wrapper.hpp | 144 +--- include/threadschedule/threadschedule.hpp | 5 + 11 files changed, 534 insertions(+), 1160 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93ea620..e1b01d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,85 @@ +# Changelog + +## v2.0.0 (unreleased) + +### Breaking Changes + +- **`ThreadPool` and `FastThreadPool` are now type aliases** for + `ThreadPoolBase` and `ThreadPoolBase`. Behavior + is unchanged, but code that forward-declares or specializes on the concrete + class name may need adjustment. + +- **`configure_threads()`, `set_affinity()`, `distribute_across_cpus()`** on + `ThreadPool` and `FastThreadPool` now return `expected` + instead of `bool`. `HighPerformancePool` already used this return type. 
+ Migration: `if (pool.configure_threads(...))` still compiles (expected has + `operator bool`), but code that stores the result in a `bool` variable needs + updating to `auto` or the expected type. + +- **`ThreadPool::Statistics`** now includes `tasks_per_second` and + `avg_task_time` fields (previously only on `FastThreadPool` and + `HighPerformancePool`). + +- **`submit_range()` removed** from `ThreadPool`. Use `submit_batch()` instead + (consistent with `FastThreadPool` and `HighPerformancePool`). `submit_batch()` + is also more efficient: it acquires the queue lock once for the entire batch + instead of per-item. + +- **`GlobalThreadPool::submit_range()` removed**. Use + `GlobalThreadPool::submit_batch()`. + +- **`HighPerformancePoolWithErrors`, `FastThreadPoolWithErrors`, + `ThreadPoolWithErrors`** are now type aliases for `PoolWithErrors`. The + public API is unchanged. + +- **`GlobalThreadPool`, `GlobalHighPerformancePool`** are now type aliases for + `GlobalPool`. The public API is unchanged. + +### New Types + +- `ThreadPoolBase` - parameterized single-queue thread pool. +- `IndefiniteWait` / `PollingWait` - wait policy types for `ThreadPoolBase`. +- `PoolWithErrors` - generic error-handling pool wrapper. +- `GlobalPool` - generic singleton pool accessor. + +### Internal Improvements + +- **~1000 lines of code duplication removed** across `thread_pool.hpp`, + `thread_pool_with_errors.hpp`, `thread_wrapper.hpp`, `thread_registry.hpp`, + `pthread_wrapper.hpp`, `profiles.hpp`, and `scheduled_pool.hpp`. + +- **Priority / affinity / scheduling policy** OS-level logic centralized into + `detail::apply_priority()`, `detail::apply_scheduling_policy()`, and + `detail::apply_affinity()` free functions (overloaded for `pthread_t`, + `pid_t`, and `HANDLE`). `BaseThreadWrapper`, `ThreadControlBlock`, + `PThreadWrapper`, and `ThreadByNameView` now delegate to these shared + implementations. 
+ +- **`apply_profile()` overloads** refactored to use shared + `detail::apply_profile_to()` and `detail::apply_profile_to_pool()` helpers. + +- **`ScheduledThreadPoolT`**: `schedule_at()` and `schedule_periodic_after()` + now share a private `insert_task()` helper. + +### Migration Guide + +```cpp +// v1: bool return +bool ok = pool.configure_threads("worker"); + +// v2: expected return (operator bool still works in conditions) +auto result = pool.configure_threads("worker"); +if (!result.has_value()) { + std::cerr << result.error().message() << std::endl; +} + +// v1: submit_range +auto futures = pool.submit_range(tasks.begin(), tasks.end()); + +// v2: submit_batch (same signature, more efficient) +auto futures = pool.submit_batch(tasks.begin(), tasks.end()); +``` + ## v1.4.1 - Fix: `*WrapperReg` types (`ThreadWrapperReg`, `JThreadWrapperReg`, diff --git a/VERSION b/VERSION index 3eefcb9..227cea2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.0.0 +2.0.0 diff --git a/include/threadschedule/profiles.hpp b/include/threadschedule/profiles.hpp index 9bafd81..82e7aa7 100644 --- a/include/threadschedule/profiles.hpp +++ b/include/threadschedule/profiles.hpp @@ -58,7 +58,7 @@ struct ThreadProfile std::string name; SchedulingPolicy policy; ThreadPriority priority; - std::optional affinity; // optional pinning + std::optional affinity; }; namespace profiles @@ -103,20 +103,15 @@ inline auto background() -> ThreadProfile } } // namespace profiles +namespace detail +{ + /** - * @brief Apply a profile to a thread wrapper or view. - * - * SFINAE-constrained: only participates in overload resolution when - * @c is_thread_like_v is true (ThreadWrapper, - * JThreadWrapper, PThreadWrapper, and their views). - * - * @tparam ThreadLike A type satisfying the is_thread_like trait. - * @param t Thread wrapper or view to configure. - * @param p Profile to apply. - * @return Empty expected on success, or @c operation_not_permitted. 
+ * @brief Apply policy + optional affinity to any type exposing + * set_scheduling_policy() and set_affinity(). */ -template , int> = 0> -inline auto apply_profile(ThreadLike& t, ThreadProfile const& p) -> expected +template +inline auto apply_profile_to(T& t, ThreadProfile const& p) -> expected { bool ok = true; if (!t.set_scheduling_policy(p.policy, p.priority).has_value()) @@ -132,20 +127,18 @@ inline auto apply_profile(ThreadLike& t, ThreadProfile const& p) -> expected expected +template +inline auto apply_profile_to_pool(PoolType& pool, std::string const& name_prefix, ThreadProfile const& p) + -> expected { bool ok = true; - if (!t.set_scheduling_policy(p.policy, p.priority).has_value()) + if (!pool.configure_threads(name_prefix, p.policy, p.priority).has_value()) ok = false; if (p.affinity.has_value()) { - if (!t.set_affinity(*p.affinity).has_value()) + if (!pool.set_affinity(*p.affinity).has_value()) ok = false; } if (ok) @@ -153,18 +146,33 @@ inline auto apply_profile(ThreadControlBlock& t, ThreadProfile const& p) -> expe return unexpected(std::make_error_code(std::errc::operation_not_permitted)); } +} // namespace detail + /** - * @brief Apply a profile to a registered thread via its info record. - * - * Dereferences @c t.control and delegates to the ThreadControlBlock - * overload. - * - * @warning Undefined behaviour if @c t.control is @c nullptr. + * @brief Apply a profile to a thread wrapper or view. * - * @param t Registered thread info whose control pointer is dereferenced. + * @tparam ThreadLike A type satisfying the is_thread_like trait. + * @param t Thread wrapper or view to configure. * @param p Profile to apply. * @return Empty expected on success, or @c operation_not_permitted. */ +template , int> = 0> +inline auto apply_profile(ThreadLike& t, ThreadProfile const& p) -> expected +{ + return detail::apply_profile_to(t, p); +} + +/** + * @brief Apply a profile to a ThreadControlBlock directly. 
+ */ +inline auto apply_profile(ThreadControlBlock& t, ThreadProfile const& p) -> expected +{ + return detail::apply_profile_to(t, p); +} + +/** + * @brief Apply a profile to a registered thread via its info record. + */ inline auto apply_profile(RegisteredThreadInfo& t, ThreadProfile const& p) -> expected { return apply_profile(*t.control, p); @@ -172,62 +180,30 @@ inline auto apply_profile(RegisteredThreadInfo& t, ThreadProfile const& p) -> ex /** * @brief Apply a profile to every worker in a ThreadPool. - * - * Uses @c "pool" as the thread name prefix passed to - * ThreadPool::configure_threads(). - * - * @param pool Thread pool to configure. - * @param p Profile to apply. - * @return Empty expected on success, or @c operation_not_permitted. */ inline auto apply_profile(ThreadPool& pool, ThreadProfile const& p) -> expected { - bool ok = true; - // Name prefix left to caller via configure_threads; here just policy/priority - if (!pool.configure_threads("pool", p.policy, p.priority)) - ok = false; - if (p.affinity.has_value()) - { - if (!pool.set_affinity(*p.affinity)) - ok = false; - } - if (ok) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::apply_profile_to_pool(pool, "pool", p); +} + +/** + * @brief Apply a profile to every worker in a FastThreadPool. + */ +inline auto apply_profile(FastThreadPool& pool, ThreadProfile const& p) -> expected +{ + return detail::apply_profile_to_pool(pool, "fast", p); } /** * @brief Apply a profile to every worker in a HighPerformancePool. - * - * Uses @c "hp" as the thread name prefix passed to - * HighPerformancePool::configure_threads(). - * - * @param pool High-performance pool to configure. - * @param p Profile to apply. - * @return Empty expected on success, or @c operation_not_permitted. 
*/ inline auto apply_profile(HighPerformancePool& pool, ThreadProfile const& p) -> expected { - bool ok = true; - if (!pool.configure_threads("hp", p.policy, p.priority).has_value()) - ok = false; - if (p.affinity.has_value()) - { - if (!pool.set_affinity(*p.affinity).has_value()) - ok = false; - } - if (ok) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::apply_profile_to_pool(pool, "hp", p); } /** * @brief Apply a profile to a registry-managed thread identified by TID. - * - * @param reg Thread registry that owns the thread. - * @param tid Thread identifier within the registry. - * @param p Profile to apply. - * @return Empty expected on success, or @c operation_not_permitted. */ inline auto apply_profile(ThreadRegistry& reg, Tid tid, ThreadProfile const& p) -> expected { diff --git a/include/threadschedule/pthread_wrapper.hpp b/include/threadschedule/pthread_wrapper.hpp index 61d5f8c..86c8b4d 100644 --- a/include/threadschedule/pthread_wrapper.hpp +++ b/include/threadschedule/pthread_wrapper.hpp @@ -168,40 +168,18 @@ class PThreadWrapper [[nodiscard]] auto set_priority(ThreadPriority priority) const -> expected { - int const policy = SCHED_OTHER; - auto params_result = SchedulerParams::create_for_policy(SchedulingPolicy::OTHER, priority); - - if (!params_result.has_value()) - { - return unexpected(params_result.error()); - } - - if (pthread_setschedparam(thread_, policy, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_priority(thread_, priority); } [[nodiscard]] auto set_scheduling_policy(SchedulingPolicy policy, ThreadPriority priority) const -> expected { - int const policy_int = static_cast(policy); - auto params_result = SchedulerParams::create_for_policy(policy, priority); - - if (!params_result.has_value()) - { - return unexpected(params_result.error()); - } - - if (pthread_setschedparam(thread_, policy_int, 
¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_scheduling_policy(thread_, policy, priority); } [[nodiscard]] auto set_affinity(ThreadAffinity const& affinity) const -> expected { - if (pthread_setaffinity_np(thread_, sizeof(cpu_set_t), &affinity.native_handle()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_affinity(thread_, affinity); } [[nodiscard]] auto get_affinity() const -> std::optional diff --git a/include/threadschedule/scheduled_pool.hpp b/include/threadschedule/scheduled_pool.hpp index 99912b2..31c149f 100644 --- a/include/threadschedule/scheduled_pool.hpp +++ b/include/threadschedule/scheduled_pool.hpp @@ -184,23 +184,7 @@ class ScheduledThreadPoolT */ auto schedule_at(TimePoint time_point, Task task) -> ScheduledTaskHandle { - std::lock_guard lock(mutex_); - - uint64_t const task_id = next_task_id_++; - ScheduledTaskHandle handle(task_id); - - ScheduledTaskInfo info; - info.id = task_id; - info.next_run = time_point; - info.interval = Duration::zero(); - info.task = std::move(task); - info.cancelled = handle.get_cancel_flag(); - info.periodic = false; - - scheduled_tasks_.insert({time_point, std::move(info)}); - condition_.notify_one(); - - return handle; + return insert_task(time_point, Duration::zero(), std::move(task), false); } /** @@ -226,23 +210,8 @@ class ScheduledThreadPoolT */ auto schedule_periodic_after(Duration initial_delay, Duration interval, Task task) -> ScheduledTaskHandle { - std::lock_guard lock(mutex_); - - uint64_t const task_id = next_task_id_++; - ScheduledTaskHandle handle(task_id); - - ScheduledTaskInfo info; - info.id = task_id; - info.next_run = std::chrono::steady_clock::now() + initial_delay; - info.interval = interval; - info.task = std::move(task); - info.cancelled = handle.get_cancel_flag(); - info.periodic = true; - - scheduled_tasks_.insert({info.next_run, 
std::move(info)}); - condition_.notify_one(); - - return handle; + auto const run_time = std::chrono::steady_clock::now() + initial_delay; + return insert_task(run_time, interval, std::move(task), true); } /** @@ -298,9 +267,7 @@ class ScheduledThreadPoolT /** * @brief Configure worker threads * - * Note: Return type depends on the underlying pool type. - * @ref ThreadPool returns bool, @ref HighPerformancePool returns expected. - * For consistent behavior, access the pool directly via thread_pool(). + * Returns expected from the underlying pool. */ auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, ThreadPriority priority = ThreadPriority::normal()) @@ -319,6 +286,27 @@ class ScheduledThreadPoolT std::multimap scheduled_tasks_; std::atomic next_task_id_; + auto insert_task(TimePoint run_time, Duration interval, Task task, bool periodic) -> ScheduledTaskHandle + { + std::lock_guard lock(mutex_); + + uint64_t const task_id = next_task_id_++; + ScheduledTaskHandle handle(task_id); + + ScheduledTaskInfo info; + info.id = task_id; + info.next_run = run_time; + info.interval = interval; + info.task = std::move(task); + info.cancelled = handle.get_cancel_flag(); + info.periodic = periodic; + + scheduled_tasks_.insert({run_time, std::move(info)}); + condition_.notify_one(); + + return handle; + } + void scheduler_loop() { while (true) diff --git a/include/threadschedule/scheduler_policy.hpp b/include/threadschedule/scheduler_policy.hpp index 8f99628..3b201fd 100644 --- a/include/threadschedule/scheduler_policy.hpp +++ b/include/threadschedule/scheduler_policy.hpp @@ -11,6 +11,7 @@ #ifdef _WIN32 #include #else +#include #include #include #endif @@ -482,4 +483,144 @@ inline auto to_string(SchedulingPolicy policy) -> std::string } } +// --------------------------------------------------------------------------- +// detail:: free functions for thread configuration (priority, policy, affinity) +// +// Overloaded by handle 
type so that every wrapper class can delegate with a +// single call: detail::apply_priority(handle, priority). +// --------------------------------------------------------------------------- + +namespace detail +{ + +#ifdef _WIN32 + +inline auto map_priority_to_win32(int prio_val) -> int +{ + if (prio_val <= -10) + return THREAD_PRIORITY_IDLE; + if (prio_val <= -5) + return THREAD_PRIORITY_LOWEST; + if (prio_val < 0) + return THREAD_PRIORITY_BELOW_NORMAL; + if (prio_val == 0) + return THREAD_PRIORITY_NORMAL; + if (prio_val <= 5) + return THREAD_PRIORITY_ABOVE_NORMAL; + if (prio_val <= 10) + return THREAD_PRIORITY_HIGHEST; + return THREAD_PRIORITY_TIME_CRITICAL; +} + +inline auto apply_priority(HANDLE handle, ThreadPriority priority) -> expected +{ + if (!handle) + return unexpected(std::make_error_code(std::errc::no_such_process)); + if (SetThreadPriority(handle, map_priority_to_win32(priority.value())) != 0) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); +} + +inline auto apply_scheduling_policy(HANDLE handle, SchedulingPolicy /*policy*/, ThreadPriority priority) + -> expected +{ + return apply_priority(handle, priority); +} + +inline auto apply_affinity(HANDLE handle, ThreadAffinity const& affinity) -> expected +{ + if (!handle) + return unexpected(std::make_error_code(std::errc::no_such_process)); + using SetThreadGroupAffinityFn = BOOL(WINAPI*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY); + HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); + if (hMod) + { + auto set_group_affinity = reinterpret_cast( + reinterpret_cast(GetProcAddress(hMod, "SetThreadGroupAffinity"))); + if (set_group_affinity && affinity.has_any()) + { + GROUP_AFFINITY ga{}; + ga.Mask = static_cast(affinity.get_mask()); + ga.Group = affinity.get_group(); + if (set_group_affinity(handle, &ga, nullptr) != 0) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + } + } + DWORD_PTR mask = 
static_cast(affinity.get_mask()); + if (SetThreadAffinityMask(handle, mask) != 0) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); +} + +#else // POSIX + +// --- pthread_t overloads (BaseThreadWrapper, ThreadControlBlock, PThreadWrapper) --- + +inline auto apply_priority(pthread_t handle, ThreadPriority priority) -> expected +{ + int const policy = SCHED_OTHER; + auto params_result = SchedulerParams::create_for_policy(SchedulingPolicy::OTHER, priority); + if (!params_result.has_value()) + return unexpected(params_result.error()); + if (pthread_setschedparam(handle, policy, ¶ms_result.value()) == 0) + return {}; + return unexpected(std::error_code(errno, std::generic_category())); +} + +inline auto apply_scheduling_policy(pthread_t handle, SchedulingPolicy policy, ThreadPriority priority) + -> expected +{ + int const policy_int = static_cast(policy); + auto params_result = SchedulerParams::create_for_policy(policy, priority); + if (!params_result.has_value()) + return unexpected(params_result.error()); + if (pthread_setschedparam(handle, policy_int, ¶ms_result.value()) == 0) + return {}; + return unexpected(std::error_code(errno, std::generic_category())); +} + +inline auto apply_affinity(pthread_t handle, ThreadAffinity const& affinity) -> expected +{ + if (pthread_setaffinity_np(handle, sizeof(cpu_set_t), &affinity.native_handle()) == 0) + return {}; + return unexpected(std::error_code(errno, std::generic_category())); +} + +// --- pid_t / TID overloads (ThreadByNameView) --- + +inline auto apply_priority(pid_t tid, ThreadPriority priority) -> expected +{ + int const policy = SCHED_OTHER; + auto params_result = SchedulerParams::create_for_policy(SchedulingPolicy::OTHER, priority); + if (!params_result.has_value()) + return unexpected(params_result.error()); + if (sched_setscheduler(tid, policy, ¶ms_result.value()) == 0) + return {}; + return unexpected(std::error_code(errno, std::generic_category())); +} + +inline auto 
apply_scheduling_policy(pid_t tid, SchedulingPolicy policy, ThreadPriority priority) + -> expected +{ + int const policy_int = static_cast(policy); + auto params_result = SchedulerParams::create_for_policy(policy, priority); + if (!params_result.has_value()) + return unexpected(params_result.error()); + if (sched_setscheduler(tid, policy_int, ¶ms_result.value()) == 0) + return {}; + return unexpected(std::error_code(errno, std::generic_category())); +} + +inline auto apply_affinity(pid_t tid, ThreadAffinity const& affinity) -> expected +{ + if (sched_setaffinity(tid, sizeof(cpu_set_t), &affinity.native_handle()) == 0) + return {}; + return unexpected(std::error_code(errno, std::generic_category())); +} + +#endif + +} // namespace detail + } // namespace threadschedule diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index 99e94bd..5a488f2 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -79,7 +79,6 @@ class WorkStealingDeque { } - // Thread-safe operations [[nodiscard]] auto push(T&& item) -> bool { std::lock_guard lock(mutex_); @@ -88,7 +87,7 @@ class WorkStealingDeque if (t - b >= capacity_) { - return false; // Queue full + return false; } buffer_[t % capacity_] = AlignedItem(std::move(item)); @@ -104,7 +103,7 @@ class WorkStealingDeque if (t - b >= capacity_) { - return false; // Queue full + return false; } buffer_[t % capacity_] = AlignedItem(item); @@ -120,7 +119,7 @@ class WorkStealingDeque if (t <= b) { - return false; // Empty + return false; } size_t const new_top = t - 1; @@ -129,7 +128,6 @@ class WorkStealingDeque return true; } - // Thief operations (other threads stealing work) [[nodiscard]] auto steal(T& item) -> bool { std::lock_guard lock(mutex_); @@ -138,7 +136,7 @@ class WorkStealingDeque if (b >= t) { - return false; // Empty + return false; } item = std::move(buffer_[b % capacity_].item); @@ -259,7 +257,6 @@ class HighPerformancePool : 
num_threads_(num_threads == 0 ? 1 : num_threads), stop_(false), next_victim_(0), start_time_(std::chrono::steady_clock::now()) { - // Initialize per-thread work queues worker_queues_.resize(num_threads_); for (size_t i = 0; i < num_threads_; ++i) { @@ -268,7 +265,6 @@ class HighPerformancePool workers_.reserve(num_threads_); - // Create worker threads with thread-local storage for (size_t i = 0; i < num_threads_; ++i) { workers_.emplace_back(&HighPerformancePool::worker_function, this, i); @@ -298,20 +294,17 @@ class HighPerformancePool if (stop_.load(std::memory_order_acquire)) { - throw std::runtime_error("ThreadPool is shutting down"); + throw std::runtime_error("HighPerformancePool is shutting down"); } - // Try to submit to least loaded queue (round-robin with fallback) size_t const preferred_queue = next_victim_.fetch_add(1, std::memory_order_relaxed) % num_threads_; - // First try the preferred queue if (worker_queues_[preferred_queue]->push([task]() { (*task)(); })) { wakeup_condition_.notify_one(); return result; } - // If preferred queue is full, try a few random ones for (size_t attempts = 0; attempts < (std::min)(num_threads_, size_t(3)); ++attempts) { size_t const idx = (preferred_queue + attempts + 1) % num_threads_; @@ -322,12 +315,11 @@ class HighPerformancePool } } - // All local queues full, use overflow queue { std::lock_guard lock(overflow_mutex_); if (stop_.load(std::memory_order_relaxed)) { - throw std::runtime_error("ThreadPool is shutting down"); + throw std::runtime_error("HighPerformancePool is shutting down"); } overflow_tasks_.emplace([task]() { (*task)(); }); } @@ -348,10 +340,9 @@ class HighPerformancePool if (stop_.load(std::memory_order_acquire)) { - throw std::runtime_error("ThreadPool is shutting down"); + throw std::runtime_error("HighPerformancePool is shutting down"); } - // Distribute batch across worker queues size_t queue_idx = next_victim_.fetch_add(batch_size, std::memory_order_relaxed) % num_threads_; for (auto it = begin; 
it != end; ++it) @@ -359,7 +350,6 @@ class HighPerformancePool auto task = std::make_shared>(*it); futures.push_back(task->get_future()); - // Try to place in worker queue, round-robin style bool queued = false; for (size_t attempts = 0; attempts < num_threads_; ++attempts) { @@ -373,13 +363,11 @@ class HighPerformancePool if (!queued) { - // Overflow to global queue std::lock_guard lock(overflow_mutex_); overflow_tasks_.emplace([task]() { (*task)(); }); } } - // Wake up workers for the batch wakeup_condition_.notify_all(); return futures; } @@ -394,7 +382,6 @@ class HighPerformancePool if (total_items == 0) return; - // Calculate optimal chunk size for cache efficiency size_t const chunk_size = (std::max)(size_t(1), total_items / (num_threads_ * 4)); std::vector> futures; @@ -412,7 +399,6 @@ class HighPerformancePool it = chunk_end; } - // Wait for all chunks to complete for (auto& future : futures) { future.wait(); @@ -514,7 +500,7 @@ class HighPerformancePool std::lock_guard lock(overflow_mutex_); if (stop_.exchange(true, std::memory_order_acq_rel)) { - return; // Already shutting down + return; } } @@ -573,11 +559,9 @@ class HighPerformancePool std::vector workers_; std::vector>> worker_queues_; - // Overflow queue for when worker queues are full std::queue overflow_tasks_; mutable std::mutex overflow_mutex_; - // Synchronization std::atomic stop_; std::condition_variable wakeup_condition_; std::mutex wakeup_mutex_; @@ -585,19 +569,17 @@ class HighPerformancePool std::condition_variable completion_condition_; std::mutex completion_mutex_; - // Load balancing and statistics std::atomic next_victim_; std::atomic active_tasks_{0}; std::atomic completed_tasks_{0}; std::atomic stolen_tasks_{0}; - std::atomic total_task_time_{0}; // microseconds + std::atomic total_task_time_{0}; std::chrono::steady_clock::time_point start_time_; // NOLINTNEXTLINE(readability-function-cognitive-complexity) void worker_function(size_t worker_id) { - // Thread-local random number 
generator for work stealing thread_local std::mt19937 gen = []() { std::random_device device; return std::mt19937(device()); @@ -610,12 +592,10 @@ class HighPerformancePool { bool found_task = false; - // 1. Try to get task from own queue (fast path) if (worker_queues_[worker_id]->pop(task)) { found_task = true; } - // 2. Try to steal from other workers (limit attempts to reduce contention) else { size_t const max_steal_attempts = (std::min)(num_threads_, size_t(4)); @@ -631,7 +611,6 @@ class HighPerformancePool } } - // 3. Try overflow queue if (!found_task) { std::lock_guard lock(overflow_mutex_); @@ -645,7 +624,6 @@ class HighPerformancePool if (found_task) { - // Execute task with timing active_tasks_.fetch_add(1, std::memory_order_relaxed); auto const start_time = std::chrono::steady_clock::now(); @@ -655,7 +633,6 @@ class HighPerformancePool } catch (...) { - // Log exception or handle as needed } auto const end_time = std::chrono::steady_clock::now(); @@ -669,13 +646,11 @@ class HighPerformancePool } else { - // No work found, check if we should stop if (stop_.load(std::memory_order_acquire)) { break; } - // Wait for work with adaptive timeout std::unique_lock lock(wakeup_mutex_); wakeup_condition_.wait_for(lock, std::chrono::microseconds(100)); } @@ -683,68 +658,100 @@ class HighPerformancePool } }; +// --------------------------------------------------------------------------- +// Wait policies for ThreadPoolBase +// --------------------------------------------------------------------------- + /** - * @brief Single-queue thread pool with optimized locking for medium workloads. + * @brief Wait policy that blocks indefinitely until work is available. * - * Alternative to @ref HighPerformancePool for cases where work-stealing overhead is - * not justified. All tasks share one std::queue protected by a single mutex, - * which keeps per-task overhead low while still scaling to multiple workers. 
+ * Workers consume zero CPU while idle but wake instantly when a task is + * enqueued. Used by the @c ThreadPool type alias. + */ +struct IndefiniteWait +{ + template + static auto wait(std::condition_variable& cv, Lock& lock, Pred pred) -> bool + { + cv.wait(lock, pred); + return true; + } +}; + +/** + * @brief Wait policy that polls with a 10 ms timeout. * - * Best for: Medium workloads (100-10k tasks), consistent task patterns where - * work-stealing complexity is not needed but better performance than the basic - * @ref ThreadPool is desired. + * Workers periodically re-check the queue even without notification, trading + * a small amount of CPU for lower wake-up latency under bursty workloads. + * Used by the @c FastThreadPool type alias. + */ +struct PollingWait +{ + template + static auto wait(std::condition_variable& cv, Lock& lock, Pred pred) -> bool + { + return cv.wait_for(lock, std::chrono::milliseconds(10), pred); + } +}; + +// --------------------------------------------------------------------------- +// ThreadPoolBase +// --------------------------------------------------------------------------- + +/** + * @brief Single-queue thread pool parameterized by its idle-wait strategy. + * + * All tasks share one std::queue protected by a single mutex. The + * @p WaitPolicy template parameter controls how workers wait for new + * work: + * - @ref IndefiniteWait - blocks on condition_variable::wait() (zero CPU + * while idle, instant wake). Instantiated as @c ThreadPool. + * - @ref PollingWait - polls with condition_variable::wait_for(10 ms). + * Slightly higher idle CPU but lower worst-case latency under bursty + * loads. Instantiated as @c FastThreadPool. * * @par How task execution works * When you call submit(), the callable is wrapped in a std::packaged_task, - * pushed into the single shared task queue under a mutex lock, and one - * sleeping worker is woken via condition_variable::notify_one(). 
The woken - * worker pops the front element from the queue and executes it. If the queue - * is empty when a worker wakes up, it goes back to sleep with a 10 ms - * timeout before checking again. + * pushed into the shared task queue under a mutex lock, and one sleeping + * worker is woken via condition_variable::notify_one(). The woken worker + * pops the front element and executes it. * * @par Execution guarantees * - Every successfully submitted task (submit() returned without throwing) - * is guaranteed to eventually execute, as long as the pool is not - * destroyed while shutdown() is draining remaining work. + * is guaranteed to eventually execute. * - submit() throws std::runtime_error if the pool is already shutting - * down. In that case the task is NOT enqueued and will NOT execute. - * - Tasks are stored in a FIFO queue, so they are picked up roughly in - * submission order. However, since multiple workers pop concurrently, - * the actual completion order is non-deterministic. + * down. In that case the task is NOT enqueued. + * - Tasks are stored in a FIFO queue. Multiple workers pop concurrently, + * so submission order is roughly preserved but completion order is + * non-deterministic. * - The returned std::future becomes ready once the task finishes. If the - * task threw an exception, future.get() rethrows it. The worker thread - * itself is not affected and continues processing further tasks. + * task threw an exception, future.get() rethrows it. * - On shutdown(), workers finish their current task, then drain all - * remaining queued tasks before exiting. Tasks submitted before - * shutdown() are guaranteed to execute. + * remaining queued tasks before exiting. + * - wait_for_tasks() blocks until the queue is empty AND no worker is + * currently executing a task. * * @par Thread safety * submit() and submit_batch() may be called from any thread concurrently. * shutdown() is internally guarded and safe to call more than once. 
* - * @par Polling / wake-up - * Workers use condition_variable::wait_for with a 10 ms timeout, so an idle - * worker may take up to 10 ms to notice the stop flag after shutdown() is - * called. - * * @par Exception handling * Exceptions thrown by tasks are caught inside the worker loop. They are * stored in the std::future returned by submit(). The worker thread * continues processing. * - * @par Configuration return type - * configure_threads() and set_affinity() return bool (not - * expected as in @ref HighPerformancePool). A return - * value of false means at least one worker could not be configured. - * * @par Lifetime * The destructor calls shutdown() and joins all worker threads. Can block * if tasks are still running. * * @par Copyability / movability * Not copyable, not movable. + * + * @tparam WaitPolicy Strategy type with a static + * @c wait(cv, lock, predicate) -> bool method. */ -class FastThreadPool +template +class ThreadPoolBase { public: using Task = std::function; @@ -759,28 +766,28 @@ class FastThreadPool std::chrono::microseconds avg_task_time; }; - explicit FastThreadPool(size_t num_threads = std::thread::hardware_concurrency()) - : num_threads_(num_threads == 0 ? 1 : num_threads), stop_(false), start_time_(std::chrono::steady_clock::now()) + explicit ThreadPoolBase(size_t num_threads = std::thread::hardware_concurrency()) + : num_threads_(num_threads == 0 ? 
1 : num_threads), stop_(false), + start_time_(std::chrono::steady_clock::now()) { workers_.reserve(num_threads_); - // Create worker threads for (size_t i = 0; i < num_threads_; ++i) { - workers_.emplace_back(&FastThreadPool::worker_function, this, i); + workers_.emplace_back(&ThreadPoolBase::worker_function, this, i); } } - FastThreadPool(FastThreadPool const&) = delete; - auto operator=(FastThreadPool const&) -> FastThreadPool& = delete; + ThreadPoolBase(ThreadPoolBase const&) = delete; + auto operator=(ThreadPoolBase const&) -> ThreadPoolBase& = delete; - ~FastThreadPool() + ~ThreadPoolBase() { shutdown(); } /** - * @brief Optimized task submission with minimal locking + * @brief Submit a task to the thread pool */ template auto submit(F&& f, Args&&... args) -> std::future> @@ -796,7 +803,7 @@ class FastThreadPool std::lock_guard lock(queue_mutex_); if (stop_) { - throw std::runtime_error("FastThreadPool is shutting down"); + throw std::runtime_error("Pool is shutting down"); } tasks_.emplace([task]() { (*task)(); }); } @@ -806,20 +813,19 @@ class FastThreadPool } /** - * @brief Efficient batch processing + * @brief Submit multiple tasks under a single lock acquisition */ template auto submit_batch(Iterator begin, Iterator end) -> std::vector> { std::vector> futures; - size_t const batch_size = std::distance(begin, end); - futures.reserve(batch_size); + futures.reserve(std::distance(begin, end)); { std::lock_guard lock(queue_mutex_); if (stop_) { - throw std::runtime_error("FastThreadPool is shutting down"); + throw std::runtime_error("Pool is shutting down"); } for (auto it = begin; it != end; ++it) @@ -830,35 +836,46 @@ class FastThreadPool } } - // Wake up all workers for batch processing condition_.notify_all(); return futures; } - void shutdown() + /** + * @brief Apply a function to a range of values in parallel + */ + template + void parallel_for_each(Iterator begin, Iterator end, F&& func) { + std::vector> futures; + futures.reserve(std::distance(begin, 
end)); + + for (auto it = begin; it != end; ++it) { - std::lock_guard lock(queue_mutex_); - if (stop_) - return; - stop_ = true; + futures.push_back(submit([func, it]() { func(*it); })); } - condition_.notify_all(); - - for (auto& worker : workers_) + for (auto& future : futures) { - if (worker.joinable()) - { - worker.join(); - } + future.wait(); } + } - workers_.clear(); + [[nodiscard]] auto size() const noexcept -> size_t + { + return num_threads_; } + [[nodiscard]] auto pending_tasks() const -> size_t + { + std::lock_guard lock(queue_mutex_); + return tasks_.size(); + } + + /** + * @brief Configure all worker threads (name, scheduling policy, priority) + */ auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, - ThreadPriority priority = ThreadPriority::normal()) -> bool + ThreadPriority priority = ThreadPriority::normal()) -> expected { bool success = true; @@ -866,64 +883,62 @@ class FastThreadPool { std::string const thread_name = name_prefix + "_" + std::to_string(i); - if (!workers_[i].set_name(thread_name)) + if (!workers_[i].set_name(thread_name).has_value()) { success = false; } - if (!workers_[i].set_scheduling_policy(policy, priority)) + if (!workers_[i].set_scheduling_policy(policy, priority).has_value()) { success = false; } } - - return success; + if (success) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); } - auto set_affinity(ThreadAffinity const& affinity) -> bool + /** + * @brief Set CPU affinity for all worker threads + */ + auto set_affinity(ThreadAffinity const& affinity) -> expected { bool success = true; for (auto& worker : workers_) { - if (!worker.set_affinity(affinity)) + if (!worker.set_affinity(affinity).has_value()) { success = false; } } - - return success; + if (success) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); } - auto distribute_across_cpus() -> bool + /** + * @brief Distribute 
workers across available CPUs (round-robin) + */ + auto distribute_across_cpus() -> expected { auto const cpu_count = std::thread::hardware_concurrency(); if (cpu_count == 0) - return false; + return unexpected(std::make_error_code(std::errc::invalid_argument)); bool success = true; for (size_t i = 0; i < workers_.size(); ++i) { ThreadAffinity affinity({static_cast(i % cpu_count)}); - if (!workers_[i].set_affinity(affinity)) + if (!workers_[i].set_affinity(affinity).has_value()) { success = false; } } - - return success; - } - - [[nodiscard]] auto size() const noexcept -> size_t - { - return num_threads_; - } - - [[nodiscard]] auto pending_tasks() const -> size_t - { - std::lock_guard lock(queue_mutex_); - return tasks_.size(); + if (success) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); } void wait_for_tasks() @@ -933,6 +948,31 @@ class FastThreadPool lock, [this] { return tasks_.empty() && active_tasks_.load(std::memory_order_acquire) == 0; }); } + void shutdown() + { + { + std::lock_guard lock(queue_mutex_); + if (stop_) + return; + stop_ = true; + } + + condition_.notify_all(); + + for (auto& worker : workers_) + { + if (worker.joinable()) + { + worker.join(); + } + } + + workers_.clear(); + } + + /** + * @brief Get performance statistics + */ [[nodiscard]] auto get_statistics() const -> Statistics { auto const now = std::chrono::steady_clock::now(); @@ -978,11 +1018,11 @@ class FastThreadPool std::atomic stop_; std::atomic active_tasks_{0}; std::atomic completed_tasks_{0}; - std::atomic total_task_time_{0}; // microseconds + std::atomic total_task_time_{0}; std::chrono::steady_clock::time_point start_time_; - void worker_function(size_t /* worker_id */) + void worker_function(size_t /*worker_id*/) { while (true) { @@ -992,8 +1032,7 @@ class FastThreadPool { std::unique_lock lock(queue_mutex_); - if (condition_.wait_for(lock, std::chrono::milliseconds(10), - [this] { return stop_ || !tasks_.empty(); })) + if 
(WaitPolicy::wait(condition_, lock, [this] { return stop_ || !tasks_.empty(); })) { if (stop_ && tasks_.empty()) { @@ -1039,426 +1078,64 @@ class FastThreadPool }; /** - * @brief Simple, general-purpose thread pool. - * - * This is a straightforward thread pool implementation suitable for: - * - Simple workloads with low task counts (< 1k tasks) - * - General application use (50k-500k tasks/second) - * - Simple task submission patterns - * - Lower memory overhead and complexity - * - Easier to understand and debug + * @brief General-purpose thread pool with indefinite blocking wait. * - * For high-throughput scenarios (> 1k tasks), consider @ref FastThreadPool or - * @ref HighPerformancePool. + * Workers block on condition_variable::wait() when idle - zero CPU + * consumption, instant wake-up on task submission. Suitable for most + * workloads. * - * @par How task execution works - * When you call submit(), the callable is wrapped in a std::packaged_task - * and pushed into a single shared std::queue under a mutex lock. One - * sleeping worker is then woken via condition_variable::notify_one(). The - * woken worker pops the front task from the queue and executes it. Workers - * block indefinitely on the condition_variable when the queue is empty (no - * polling timeout), so they consume zero CPU while idle. - * - * @par Execution guarantees - * - Every successfully submitted task (submit() returned without throwing) - * is guaranteed to eventually execute. - * - submit() throws std::runtime_error if the pool is already shutting - * down. In that case the task is NOT enqueued. - * - Tasks are stored in a FIFO queue. Multiple workers pop concurrently, so - * submission order is roughly preserved but completion order is - * non-deterministic. - * - The returned std::future becomes ready once the task finishes. If the - * task threw an exception, future.get() rethrows it. - * - On shutdown(), the stop flag is set and all workers are woken. 
Each - * worker finishes its current task and then exits only if the queue is - * empty. This means all tasks that were enqueued before shutdown() are - * guaranteed to execute. - * - wait_for_tasks() blocks until the queue is empty AND no worker is - * currently executing a task. - * - * @par Thread safety - * submit() may be called from any thread concurrently. All task-queue access - * is serialized through queue_mutex_. - * - * @par Wake-up behaviour - * Workers block on a std::condition_variable (no polling timeout), so they - * consume no CPU while idle but wake instantly when a task is enqueued. - * - * @par Internal counter note - * Unlike @ref FastThreadPool and @ref HighPerformancePool, active_tasks_ and - * completed_tasks_ are incremented/decremented while queue_mutex_ is held. - * This means they are always consistent with the queue size, but every task - * completion acquires the mutex an extra time. - * - * @par Exception handling - * Exceptions thrown by tasks are caught inside the worker loop. They are - * stored in the std::future returned by submit(). The worker thread - * continues processing. - * - * @par Lifetime - * The destructor calls shutdown() and joins all worker threads. Can block - * if tasks are still running. - * - * @par Copyability / movability - * Not copyable, not movable. + * @see ThreadPoolBase, IndefiniteWait */ -class ThreadPool -{ - public: - using Task = std::function; - - struct Statistics - { - size_t total_threads; - size_t active_threads; - size_t pending_tasks; - size_t completed_tasks; - }; - - explicit ThreadPool(size_t num_threads = std::thread::hardware_concurrency()) - : num_threads_(num_threads == 0 ? 
1 : num_threads), stop_(false) - { - workers_.reserve(num_threads_); - - // Create worker threads - for (size_t i = 0; i < num_threads_; ++i) - { - workers_.emplace_back(&ThreadPool::worker_function, this); - } - } - - ThreadPool(ThreadPool const&) = delete; - auto operator=(ThreadPool const&) -> ThreadPool& = delete; - - ~ThreadPool() - { - shutdown(); - } - - /** - * @brief Submit a task to the thread pool - */ - template - auto submit(F&& f, Args&&... args) -> std::future> - { - using return_type = std::invoke_result_t; - - auto task = std::make_shared>( - std::bind(std::forward(f), std::forward(args)...)); - - std::future result = task->get_future(); - - { - std::lock_guard lock(queue_mutex_); - - if (stop_) - { - throw std::runtime_error("ThreadPool is shutting down"); - } - - tasks_.emplace([task]() { (*task)(); }); - } - - condition_.notify_one(); - return result; - } - - /** - * @brief Submit multiple tasks - */ - template - auto submit_range(Iterator begin, Iterator end) -> std::vector> - { - std::vector> futures; - futures.reserve(std::distance(begin, end)); - - for (auto it = begin; it != end; ++it) - { - futures.push_back(submit(*it)); - } - - return futures; - } - - /** - * @brief Apply a function to a range of values in parallel - */ - template - void parallel_for_each(Iterator begin, Iterator end, F&& func) - { - std::vector> futures; - futures.reserve(std::distance(begin, end)); - - for (auto it = begin; it != end; ++it) - { - futures.push_back(submit([func, it]() { func(*it); })); - } - - // Wait for all tasks to complete - for (auto& future : futures) - { - future.wait(); - } - } - - [[nodiscard]] auto size() const noexcept -> size_t - { - return num_threads_; - } - - [[nodiscard]] auto pending_tasks() const -> size_t - { - std::lock_guard lock(queue_mutex_); - return tasks_.size(); - } - - /** - * @brief Configure thread properties - */ - auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, - 
ThreadPriority priority = ThreadPriority::normal()) -> bool - { - bool success = true; - - for (size_t i = 0; i < workers_.size(); ++i) - { - std::string const thread_name = name_prefix + "_" + std::to_string(i); - - if (!workers_[i].set_name(thread_name)) - { - success = false; - } - - if (!workers_[i].set_scheduling_policy(policy, priority)) - { - success = false; - } - } - - return success; - } - - auto set_affinity(ThreadAffinity const& affinity) -> bool - { - bool success = true; - - for (auto& worker : workers_) - { - if (!worker.set_affinity(affinity)) - { - success = false; - } - } - - return success; - } - - auto distribute_across_cpus() -> bool - { - auto const cpu_count = std::thread::hardware_concurrency(); - if (cpu_count == 0) - return false; - - bool success = true; - - for (size_t i = 0; i < workers_.size(); ++i) - { - ThreadAffinity affinity({static_cast(i % cpu_count)}); - if (!workers_[i].set_affinity(affinity)) - { - success = false; - } - } - - return success; - } - - void wait_for_tasks() - { - std::unique_lock lock(queue_mutex_); - task_finished_condition_.wait(lock, [this] { return tasks_.empty() && active_tasks_ == 0; }); - } - - void shutdown() - { - { - std::lock_guard lock(queue_mutex_); - if (stop_) - return; - stop_ = true; - } - - condition_.notify_all(); - - for (auto& worker : workers_) - { - if (worker.joinable()) - { - worker.join(); - } - } - - workers_.clear(); - } - - [[nodiscard]] auto get_statistics() const -> Statistics - { - std::lock_guard lock(queue_mutex_); - Statistics stats; - stats.total_threads = num_threads_; - stats.active_threads = active_tasks_; - stats.pending_tasks = tasks_.size(); - stats.completed_tasks = completed_tasks_; - return stats; - } - - private: - size_t num_threads_; - std::vector workers_; - std::queue tasks_; - - mutable std::mutex queue_mutex_; - std::condition_variable condition_; - std::condition_variable task_finished_condition_; - std::atomic stop_; - std::atomic active_tasks_{0}; - 
std::atomic completed_tasks_{0}; - - void worker_function() - { - while (true) - { - Task task; - - { - std::unique_lock lock(queue_mutex_); - - condition_.wait(lock, [this] { return stop_ || !tasks_.empty(); }); - - if (stop_ && tasks_.empty()) - { - return; - } - - task = std::move(tasks_.front()); - tasks_.pop(); - ++active_tasks_; - } - - try - { - task(); - } - catch (...) - { - // Log exception or handle as needed - } - - { - std::lock_guard lock(queue_mutex_); - --active_tasks_; - ++completed_tasks_; - } - - task_finished_condition_.notify_all(); - } - } -}; +using ThreadPool = ThreadPoolBase; /** - * @brief Singleton accessor for a process-wide @ref ThreadPool instance. + * @brief Thread pool with 10 ms polling wait for lower wake-up latency. * - * Provides static convenience methods that forward to a single @ref ThreadPool - * whose lifetime is managed as a function-local static (Meyer's singleton). + * Workers poll with condition_variable::wait_for(10 ms), trading a small + * amount of idle CPU for more consistent latency under bursty workloads. * - * @par Thread safety - * The underlying @ref ThreadPool is created on the first call to instance() and is - * guaranteed to be thread-safe in C++11 and later (magic statics). All - * forwarded methods (submit, submit_range, parallel_for_each) are as - * thread-safe as the corresponding @ref ThreadPool methods. - * - * @par Pool size - * The pool is created with @c std::thread::hardware_concurrency() threads. - * This size is fixed for the lifetime of the process; there is no API to - * resize the singleton pool after creation. - * - * @par Static destruction order - * Because the pool is a function-local static, it is destroyed during static - * destruction in reverse order of construction. Submitting work to the global - * pool from destructors of other static objects is undefined behaviour if the - * pool has already been destroyed. 
Prefer explicit lifetime management in - * programs with complex static initialization dependencies. - * - * @par Copyability / movability - * Not instantiable (private constructor). All access is through static - * methods. + * @see ThreadPoolBase, PollingWait */ -class GlobalThreadPool -{ - public: - static auto instance() -> ThreadPool& - { - static ThreadPool pool(std::thread::hardware_concurrency()); - return pool; - } - - template - static auto submit(F&& f, Args&&... args) - { - return instance().submit(std::forward(f), std::forward(args)...); - } - - template - static auto submit_range(Iterator begin, Iterator end) - { - return instance().submit_range(begin, end); - } - - template - static void parallel_for_each(Iterator begin, Iterator end, F&& func) - { - instance().parallel_for_each(begin, end, std::forward(func)); - } +using FastThreadPool = ThreadPoolBase; - private: - GlobalThreadPool() = default; -}; +// --------------------------------------------------------------------------- +// GlobalPool +// --------------------------------------------------------------------------- /** - * @brief Singleton accessor for a process-wide @ref HighPerformancePool instance. + * @brief Singleton accessor for a process-wide pool instance. * - * Provides static convenience methods that forward to a single - * @ref HighPerformancePool whose lifetime is managed as a function-local static - * (Meyer's singleton). + * Provides static convenience methods that forward to a single pool + * whose lifetime is managed as a function-local static (Meyer's singleton). * * @par Thread safety * The underlying pool is created on the first call to instance() and is * guaranteed to be thread-safe in C++11 and later (magic statics). All - * forwarded methods (submit, submit_batch, parallel_for_each) are as - * thread-safe as the corresponding @ref HighPerformancePool methods. + * forwarded methods are as thread-safe as the corresponding pool methods. 
* * @par Pool size * The pool is created with @c std::thread::hardware_concurrency() threads. - * This size is fixed for the lifetime of the process; there is no API to - * resize the singleton pool after creation. + * This size is fixed for the lifetime of the process. * * @par Static destruction order * Because the pool is a function-local static, it is destroyed during static * destruction in reverse order of construction. Submitting work to the global * pool from destructors of other static objects is undefined behaviour if the - * pool has already been destroyed. Prefer explicit lifetime management in - * programs with complex static initialization dependencies. + * pool has already been destroyed. * * @par Copyability / movability * Not instantiable (private constructor). All access is through static * methods. + * + * @tparam PoolType The concrete pool type to wrap. */ -class GlobalHighPerformancePool +template +class GlobalPool { public: - static auto instance() -> HighPerformancePool& + static auto instance() -> PoolType& { - static HighPerformancePool pool(std::thread::hardware_concurrency()); + static PoolType pool(std::thread::hardware_concurrency()); return pool; } @@ -1481,9 +1158,15 @@ class GlobalHighPerformancePool } private: - GlobalHighPerformancePool() = default; + GlobalPool() = default; }; +/** @brief Singleton @ref ThreadPool accessor. */ +using GlobalThreadPool = GlobalPool; + +/** @brief Singleton @ref HighPerformancePool accessor. */ +using GlobalHighPerformancePool = GlobalPool; + /** * @brief Convenience wrapper that applies a callable to every element of a * container in parallel using the @ref GlobalThreadPool singleton. @@ -1495,17 +1178,6 @@ class GlobalHighPerformancePool * * The call blocks until every element has been processed. * - * @par Thread safety - * The function itself is thread-safe (it forwards to @ref GlobalThreadPool which - * guards its queue with a mutex). 
However, the caller must ensure that - * concurrent invocations of @p func on different elements do not race on - * shared state. - * - * @par Pool lifetime - * On the first call, GlobalThreadPool::instance() lazily creates the - * singleton pool sized to @c std::thread::hardware_concurrency(). See - * @ref GlobalThreadPool for static-destruction-order caveats. - * * @tparam Container Any type exposing begin() / end() iterators. * @tparam F Callable compatible with @c void(Container::value_type&). * diff --git a/include/threadschedule/thread_pool_with_errors.hpp b/include/threadschedule/thread_pool_with_errors.hpp index 1694cd4..36b5c61 100644 --- a/include/threadschedule/thread_pool_with_errors.hpp +++ b/include/threadschedule/thread_pool_with_errors.hpp @@ -8,10 +8,9 @@ namespace threadschedule { /** - * @brief @ref HighPerformancePool combined with an @ref ErrorHandler. + * @brief Thread pool wrapper that combines any pool type with an @ref ErrorHandler. * - * Non-copyable, non-movable. Thread-safe (delegates to the underlying - * @ref HighPerformancePool). + * Non-copyable, non-movable. Thread-safe (delegates to the underlying pool). * * submit() wraps every task so that exceptions are both reported to * the @ref ErrorHandler (via registered callbacks) **and** re-thrown, making @@ -20,11 +19,15 @@ namespace threadschedule * description string to the error report for easier diagnostics. * * @see FutureWithErrorHandler, ErrorHandler, TaskError + * + * @tparam PoolType The underlying pool type (e.g. ThreadPool, + * FastThreadPool, HighPerformancePool). 
*/ -class HighPerformancePoolWithErrors +template +class PoolWithErrors { public: - explicit HighPerformancePoolWithErrors(size_t num_threads = std::thread::hardware_concurrency()) + explicit PoolWithErrors(size_t num_threads = std::thread::hardware_concurrency()) : pool_(num_threads), error_handler_(std::make_shared()) { } @@ -86,167 +89,6 @@ class HighPerformancePoolWithErrors return FutureWithErrorHandler>(std::move(future)); } - /** - * @brief Add a global error callback for all tasks - */ - auto add_error_callback(ErrorCallback callback) -> size_t - { - return error_handler_->add_callback(std::move(callback)); - } - - /** - * @brief Clear all error callbacks - */ - void clear_error_callbacks() - { - error_handler_->clear_callbacks(); - } - - /** - * @brief Get total error count - */ - [[nodiscard]] auto error_count() const -> size_t - { - return error_handler_->error_count(); - } - - /** - * @brief Reset error count - */ - void reset_error_count() - { - error_handler_->reset_error_count(); - } - - /** - * @brief Get the underlying pool - */ - [[nodiscard]] auto pool() -> HighPerformancePool& - { - return pool_; - } - - /** - * @brief Get statistics from underlying pool - */ - [[nodiscard]] auto get_statistics() const -> HighPerformancePool::Statistics - { - return pool_.get_statistics(); - } - - /** - * @brief Configure threads - */ - auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, - ThreadPriority priority = ThreadPriority::normal()) -> expected - { - return pool_.configure_threads(name_prefix, policy, priority); - } - - auto set_affinity(ThreadAffinity const& affinity) -> expected - { - return pool_.set_affinity(affinity); - } - - auto distribute_across_cpus() -> expected - { - return pool_.distribute_across_cpus(); - } - - void shutdown() - { - pool_.shutdown(); - } - - void wait_for_tasks() - { - pool_.wait_for_tasks(); - } - - [[nodiscard]] auto size() const noexcept -> size_t - { - return 
pool_.size(); - } - - [[nodiscard]] auto pending_tasks() const -> size_t - { - return pool_.pending_tasks(); - } - - private: - HighPerformancePool pool_; - std::shared_ptr error_handler_; -}; - -/** - * @brief FastThreadPool combined with an ErrorHandler. - * - * Non-copyable, non-movable. Thread-safe (delegates to the underlying - * FastThreadPool). Same error-handling semantics as - * HighPerformancePoolWithErrors: exceptions are reported to the - * ErrorHandler **and** re-thrown through the future. - * - * @see HighPerformancePoolWithErrors for detailed behaviour. - */ -class FastThreadPoolWithErrors -{ - public: - explicit FastThreadPoolWithErrors(size_t num_threads = std::thread::hardware_concurrency()) - : pool_(num_threads), error_handler_(std::make_shared()) - { - } - - template - auto submit(F&& f, Args&&... args) -> FutureWithErrorHandler> - { - auto handler = error_handler_; - auto wrapped_task = [f = std::forward(f), args = std::make_tuple(std::forward(args)...), handler]() { - try - { - return std::apply(f, args); - } - catch (...) - { - TaskError error; - error.exception = std::current_exception(); - error.thread_id = std::this_thread::get_id(); - error.timestamp = std::chrono::steady_clock::now(); - handler->handle_error(error); - throw; - } - }; - - auto future = pool_.submit(std::move(wrapped_task)); - return FutureWithErrorHandler>(std::move(future)); - } - - template - auto submit_with_description(std::string const& description, F&& f, Args&&... args) - -> FutureWithErrorHandler> - { - auto handler = error_handler_; - auto wrapped_task = [f = std::forward(f), args = std::make_tuple(std::forward(args)...), handler, - description]() { - try - { - return std::apply(f, args); - } - catch (...) 
- { - TaskError error; - error.exception = std::current_exception(); - error.task_description = description; - error.thread_id = std::this_thread::get_id(); - error.timestamp = std::chrono::steady_clock::now(); - handler->handle_error(error); - throw; - } - }; - - auto future = pool_.submit(std::move(wrapped_task)); - return FutureWithErrorHandler>(std::move(future)); - } - auto add_error_callback(ErrorCallback callback) -> size_t { return error_handler_->add_callback(std::move(callback)); @@ -267,28 +109,28 @@ class FastThreadPoolWithErrors error_handler_->reset_error_count(); } - [[nodiscard]] auto pool() -> FastThreadPool& + [[nodiscard]] auto pool() -> PoolType& { return pool_; } - [[nodiscard]] auto get_statistics() const -> FastThreadPool::Statistics + [[nodiscard]] auto get_statistics() const -> decltype(auto) { return pool_.get_statistics(); } auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, - ThreadPriority priority = ThreadPriority::normal()) -> bool + ThreadPriority priority = ThreadPriority::normal()) -> decltype(auto) { return pool_.configure_threads(name_prefix, policy, priority); } - auto set_affinity(ThreadAffinity const& affinity) -> bool + auto set_affinity(ThreadAffinity const& affinity) -> decltype(auto) { return pool_.set_affinity(affinity); } - auto distribute_across_cpus() -> bool + auto distribute_across_cpus() -> decltype(auto) { return pool_.distribute_across_cpus(); } @@ -314,148 +156,17 @@ class FastThreadPoolWithErrors } private: - FastThreadPool pool_; + PoolType pool_; std::shared_ptr error_handler_; }; -/** - * @brief ThreadPool combined with an ErrorHandler. - * - * Non-copyable, non-movable. Thread-safe (delegates to the underlying - * ThreadPool). Same error-handling semantics as - * HighPerformancePoolWithErrors: exceptions are reported to the - * ErrorHandler **and** re-thrown through the future. - * - * @see HighPerformancePoolWithErrors for detailed behaviour. 
- */ -class ThreadPoolWithErrors -{ - public: - explicit ThreadPoolWithErrors(size_t num_threads = std::thread::hardware_concurrency()) - : pool_(num_threads), error_handler_(std::make_shared()) - { - } - - template - auto submit(F&& f, Args&&... args) -> FutureWithErrorHandler> - { - auto handler = error_handler_; - auto wrapped_task = [f = std::forward(f), args = std::make_tuple(std::forward(args)...), handler]() { - try - { - return std::apply(f, args); - } - catch (...) - { - TaskError error; - error.exception = std::current_exception(); - error.thread_id = std::this_thread::get_id(); - error.timestamp = std::chrono::steady_clock::now(); - handler->handle_error(error); - throw; - } - }; - - auto future = pool_.submit(std::move(wrapped_task)); - return FutureWithErrorHandler>(std::move(future)); - } - - template - auto submit_with_description(std::string const& description, F&& f, Args&&... args) - -> FutureWithErrorHandler> - { - auto handler = error_handler_; - auto wrapped_task = [f = std::forward(f), args = std::make_tuple(std::forward(args)...), handler, - description]() { - try - { - return std::apply(f, args); - } - catch (...) 
- { - TaskError error; - error.exception = std::current_exception(); - error.task_description = description; - error.thread_id = std::this_thread::get_id(); - error.timestamp = std::chrono::steady_clock::now(); - handler->handle_error(error); - throw; - } - }; - - auto future = pool_.submit(std::move(wrapped_task)); - return FutureWithErrorHandler>(std::move(future)); - } - - auto add_error_callback(ErrorCallback callback) -> size_t - { - return error_handler_->add_callback(std::move(callback)); - } - - void clear_error_callbacks() - { - error_handler_->clear_callbacks(); - } - - [[nodiscard]] auto error_count() const -> size_t - { - return error_handler_->error_count(); - } - - void reset_error_count() - { - error_handler_->reset_error_count(); - } - - [[nodiscard]] auto pool() -> ThreadPool& - { - return pool_; - } - - [[nodiscard]] auto get_statistics() const -> ThreadPool::Statistics - { - return pool_.get_statistics(); - } - - auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, - ThreadPriority priority = ThreadPriority::normal()) -> bool - { - return pool_.configure_threads(name_prefix, policy, priority); - } +/** @brief @ref HighPerformancePool with integrated error handling. */ +using HighPerformancePoolWithErrors = PoolWithErrors; - auto set_affinity(ThreadAffinity const& affinity) -> bool - { - return pool_.set_affinity(affinity); - } +/** @brief @ref FastThreadPool with integrated error handling. 
*/ +using FastThreadPoolWithErrors = PoolWithErrors; - auto distribute_across_cpus() -> bool - { - return pool_.distribute_across_cpus(); - } - - void wait_for_tasks() - { - pool_.wait_for_tasks(); - } - - void shutdown() - { - pool_.shutdown(); - } - - [[nodiscard]] auto size() const noexcept -> size_t - { - return pool_.size(); - } - - [[nodiscard]] auto pending_tasks() const -> size_t - { - return pool_.pending_tasks(); - } - - private: - ThreadPool pool_; - std::shared_ptr error_handler_; -}; +/** @brief @ref ThreadPool with integrated error handling. */ +using ThreadPoolWithErrors = PoolWithErrors; } // namespace threadschedule diff --git a/include/threadschedule/thread_registry.hpp b/include/threadschedule/thread_registry.hpp index 3a0dae5..bb35270 100644 --- a/include/threadschedule/thread_registry.hpp +++ b/include/threadschedule/thread_registry.hpp @@ -159,67 +159,18 @@ class ThreadControlBlock [[nodiscard]] auto set_affinity(ThreadAffinity const& affinity) const -> expected { #ifdef _WIN32 - if (!handle_) - return unexpected(std::make_error_code(std::errc::no_such_process)); - using SetThreadGroupAffinityFn = BOOL(WINAPI*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY); - HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); - if (hMod) - { - auto set_group_affinity = reinterpret_cast( - reinterpret_cast(GetProcAddress(hMod, "SetThreadGroupAffinity"))); - if (set_group_affinity && affinity.has_any()) - { - GROUP_AFFINITY ga{}; - ga.Mask = static_cast(affinity.get_mask()); - ga.Group = affinity.get_group(); - if (set_group_affinity(handle_, &ga, nullptr) != 0) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); - } - } - DWORD_PTR mask = static_cast(affinity.get_mask()); - if (SetThreadAffinityMask(handle_, mask) != 0) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::apply_affinity(handle_, affinity); #else - if (pthread_setaffinity_np(pthreadHandle_, 
sizeof(cpu_set_t), &affinity.native_handle()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_affinity(pthreadHandle_, affinity); #endif } [[nodiscard]] auto set_priority(ThreadPriority priority) const -> expected { #ifdef _WIN32 - if (!handle_) - return unexpected(std::make_error_code(std::errc::no_such_process)); - int win_priority; - int prio_val = priority.value(); - if (prio_val <= -10) - win_priority = THREAD_PRIORITY_IDLE; - else if (prio_val <= -5) - win_priority = THREAD_PRIORITY_LOWEST; - else if (prio_val < 0) - win_priority = THREAD_PRIORITY_BELOW_NORMAL; - else if (prio_val == 0) - win_priority = THREAD_PRIORITY_NORMAL; - else if (prio_val <= 5) - win_priority = THREAD_PRIORITY_ABOVE_NORMAL; - else if (prio_val <= 10) - win_priority = THREAD_PRIORITY_HIGHEST; - else - win_priority = THREAD_PRIORITY_TIME_CRITICAL; - if (SetThreadPriority(handle_, win_priority) != 0) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::apply_priority(handle_, priority); #else - const int policy = SCHED_OTHER; - auto params_result = SchedulerParams::create_for_policy(SchedulingPolicy::OTHER, priority); - if (!params_result.has_value()) - return unexpected(params_result.error()); - if (pthread_setschedparam(pthreadHandle_, policy, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_priority(pthreadHandle_, priority); #endif } @@ -227,15 +178,9 @@ class ThreadControlBlock -> expected { #ifdef _WIN32 - return set_priority(priority); + return detail::apply_scheduling_policy(handle_, policy, priority); #else - const int policy_int = static_cast(policy); - auto params_result = SchedulerParams::create_for_policy(policy, priority); - if (!params_result.has_value()) - return unexpected(params_result.error()); - if (pthread_setschedparam(pthreadHandle_, policy_int, ¶ms_result.value()) 
== 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_scheduling_policy(pthreadHandle_, policy, priority); #endif } diff --git a/include/threadschedule/thread_wrapper.hpp b/include/threadschedule/thread_wrapper.hpp index 0f055dd..df270c0 100644 --- a/include/threadschedule/thread_wrapper.hpp +++ b/include/threadschedule/thread_wrapper.hpp @@ -279,118 +279,18 @@ class BaseThreadWrapper : protected detail::ThreadStorage expected { -#ifdef _WIN32 - const auto handle = native_handle(); - // Map ThreadPriority to Windows priority - // Windows thread priorities range from -15 (THREAD_PRIORITY_IDLE) to +15 (THREAD_PRIORITY_TIME_CRITICAL) - // We'll map the priority value to Windows constants - int win_priority; - int prio_val = priority.value(); - - if (prio_val <= -10) - { - win_priority = THREAD_PRIORITY_IDLE; - } - else if (prio_val <= -5) - { - win_priority = THREAD_PRIORITY_LOWEST; - } - else if (prio_val < 0) - { - win_priority = THREAD_PRIORITY_BELOW_NORMAL; - } - else if (prio_val == 0) - { - win_priority = THREAD_PRIORITY_NORMAL; - } - else if (prio_val <= 5) - { - win_priority = THREAD_PRIORITY_ABOVE_NORMAL; - } - else if (prio_val <= 10) - { - win_priority = THREAD_PRIORITY_HIGHEST; - } - else - { - win_priority = THREAD_PRIORITY_TIME_CRITICAL; - } - - if (SetThreadPriority(handle, win_priority) != 0) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); -#else - const auto handle = native_handle(); - int const policy = SCHED_OTHER; - - auto params_result = SchedulerParams::create_for_policy(SchedulingPolicy::OTHER, priority); - - if (!params_result.has_value()) - { - return unexpected(params_result.error()); - } - - if (pthread_setschedparam(handle, policy, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); -#endif + return detail::apply_priority(native_handle(), priority); } [[nodiscard]] auto 
set_scheduling_policy(SchedulingPolicy policy, ThreadPriority priority) -> expected { -#ifdef _WIN32 - // Windows doesn't have the same scheduling policy concept as Linux - // We'll just set the priority and return success - return set_priority(priority); -#else - const auto handle = native_handle(); - int const policy_int = static_cast(policy); - - auto params_result = SchedulerParams::create_for_policy(policy, priority); - if (!params_result.has_value()) - { - return unexpected(params_result.error()); - } - - if (pthread_setschedparam(handle, policy_int, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); -#endif + return detail::apply_scheduling_policy(native_handle(), policy, priority); } [[nodiscard]] auto set_affinity(ThreadAffinity const& affinity) -> expected { -#ifdef _WIN32 - const auto handle = native_handle(); - // Prefer Group Affinity if available - using SetThreadGroupAffinityFn = BOOL(WINAPI*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY); - HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); - if (hMod) - { - auto set_group_affinity = reinterpret_cast( - reinterpret_cast(GetProcAddress(hMod, "SetThreadGroupAffinity"))); - if (set_group_affinity && affinity.has_any()) - { - GROUP_AFFINITY ga{}; - ga.Mask = static_cast(affinity.get_mask()); - ga.Group = affinity.get_group(); - if (set_group_affinity(handle, &ga, nullptr) != 0) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); - } - } - // Fallback to legacy mask (single-group systems) - DWORD_PTR mask = static_cast(affinity.get_mask()); - if (SetThreadAffinityMask(handle, mask) != 0) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); -#else - const auto handle = native_handle(); - if (pthread_setaffinity_np(handle, sizeof(cpu_set_t), &affinity.native_handle()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); -#endif + 
return detail::apply_affinity(native_handle(), affinity); } [[nodiscard]] auto get_affinity() const -> std::optional @@ -610,14 +510,9 @@ class ThreadWrapper : public BaseThreadWrapper static auto create_with_config(std::string const& name, SchedulingPolicy policy, ThreadPriority priority, F&& f, Args&&... args) -> ThreadWrapper { - ThreadWrapper wrapper(std::forward(f), std::forward(args)...); - if (auto r = wrapper.set_name(name); !r.has_value()) - { - } - if (auto r = wrapper.set_scheduling_policy(policy, priority); !r.has_value()) - { - } + (void)wrapper.set_name(name); + (void)wrapper.set_scheduling_policy(policy, priority); return wrapper; } }; @@ -765,14 +660,9 @@ class JThreadWrapper : public BaseThreadWrapper static auto create_with_config(std::string const& name, SchedulingPolicy policy, ThreadPriority priority, F&& f, Args&&... args) -> JThreadWrapper { - JThreadWrapper wrapper(std::forward(f), std::forward(args)...); - if (auto r = wrapper.set_name(name); !r.has_value()) - { - } - if (auto r = wrapper.set_scheduling_policy(policy, priority); !r.has_value()) - { - } + (void)wrapper.set_name(name); + (void)wrapper.set_scheduling_policy(policy, priority); return wrapper; } }; @@ -977,13 +867,7 @@ class ThreadByNameView #else if (!found()) return unexpected(std::make_error_code(std::errc::no_such_process)); - int const policy = SCHED_OTHER; - auto params_result = SchedulerParams::create_for_policy(SchedulingPolicy::OTHER, priority); - if (!params_result.has_value()) - return unexpected(params_result.error()); - if (sched_setscheduler(handle_, policy, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_priority(handle_, priority); #endif } @@ -995,13 +879,7 @@ class ThreadByNameView #else if (!found()) return unexpected(std::make_error_code(std::errc::no_such_process)); - int policy_int = static_cast(policy); - auto params_result = SchedulerParams::create_for_policy(policy, 
priority); - if (!params_result.has_value()) - return unexpected(params_result.error()); - if (sched_setscheduler(handle_, policy_int, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_scheduling_policy(handle_, policy, priority); #endif } @@ -1012,9 +890,7 @@ class ThreadByNameView #else if (!found()) return unexpected(std::make_error_code(std::errc::no_such_process)); - if (sched_setaffinity(handle_, sizeof(cpu_set_t), &affinity.native_handle()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_affinity(handle_, affinity); #endif } diff --git a/include/threadschedule/threadschedule.hpp b/include/threadschedule/threadschedule.hpp index c698e43..06b293d 100644 --- a/include/threadschedule/threadschedule.hpp +++ b/include/threadschedule/threadschedule.hpp @@ -55,6 +55,9 @@ using ts::ErrorHandler; using ts::FastThreadPool; using ts::FastThreadPoolWithErrors; using ts::FutureWithErrorHandler; +using ts::GlobalHighPerformancePool; +using ts::GlobalPool; +using ts::GlobalThreadPool; using ts::HighPerformancePool; using ts::HighPerformancePoolWithErrors; using ts::JThreadWrapper; @@ -70,7 +73,9 @@ using ts::TaskError; using ts::ThreadAffinity; using ts::ThreadByNameView; using ts::ThreadPool; +using ts::ThreadPoolBase; using ts::ThreadPoolWithErrors; +using ts::PoolWithErrors; using ts::ThreadPriority; using ts::ThreadProfile; using ts::ThreadWrapper; From 10e7d0dcdd8e2f5be51a05aa8b0a74aa28dc0791 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 16:09:28 +0200 Subject: [PATCH 02/15] Refactor thread pool and worker configurations for improved maintainability - Centralized thread configuration methods in `detail` namespace, reducing code duplication and enhancing clarity. - Removed the `FutureWithErrorHandler` specialization, streamlining the handling of void futures. 
- Deduplicated thread naming and affinity methods across various classes, improving consistency. - Introduced a new `QueryFacadeMixin` for `CompositeThreadRegistry`, simplifying query operations. - Updated CHANGELOG to reflect internal improvements and code reductions across multiple files. --- CHANGELOG.md | 24 +++ include/threadschedule/error_handler.hpp | 80 +-------- include/threadschedule/pthread_wrapper.hpp | 21 +-- include/threadschedule/scheduler_policy.hpp | 108 ++++++++++++ include/threadschedule/thread_pool.hpp | 158 +++++++---------- include/threadschedule/thread_registry.hpp | 177 ++++++++++---------- include/threadschedule/thread_wrapper.hpp | 110 +----------- 7 files changed, 293 insertions(+), 385 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1b01d1..29f1f26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,6 +80,30 @@ auto futures = pool.submit_range(tasks.begin(), tasks.end()); auto futures = pool.submit_batch(tasks.begin(), tasks.end()); ``` +### Internal improvements (v2.0.0 continued) + +- **Pool worker configuration deduplicated**: `configure_threads()`, + `set_affinity()`, `distribute_across_cpus()` in `HighPerformancePool` and + `ThreadPoolBase` now delegate to shared `detail::configure_worker_threads`, + `detail::set_worker_affinity`, `detail::distribute_workers_across_cpus` + templates. + +- **Thread naming/affinity reading centralized**: `set_name()`, `get_name()`, + `get_affinity()` across `BaseThreadWrapper`, `PThreadWrapper`, and + `ThreadControlBlock` now delegate to `detail::apply_name`, + `detail::read_name`, `detail::read_affinity` in `scheduler_policy.hpp`. + +- **`FutureWithErrorHandler` specialization removed**: The primary + template now handles both `T` and `void` via `if constexpr`, eliminating + ~70 lines of duplicated code. No API change. 
+ +- **`CompositeThreadRegistry` facade deduplicated**: The 12 query facade + methods (filter, map, for_each, find_if, any, all, none, take, skip, count, + empty, apply) are now inherited from `detail::QueryFacadeMixin` + CRTP base. No API change. + +- Net reduction: ~116 lines across 6 files. + ## v1.4.1 - Fix: `*WrapperReg` types (`ThreadWrapperReg`, `JThreadWrapperReg`, diff --git a/include/threadschedule/error_handler.hpp b/include/threadschedule/error_handler.hpp index b3e3fb6..ebc4161 100644 --- a/include/threadschedule/error_handler.hpp +++ b/include/threadschedule/error_handler.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include namespace threadschedule @@ -327,14 +328,17 @@ class FutureWithErrorHandler * If the underlying future holds an exception, the error callback (if any) * is called **before** the exception is re-thrown to the caller. * - * @return The stored value of type @p T. + * @return The stored value of type @p T (void when @p T is @c void). * @throws Any exception stored in the underlying @c std::future. */ auto get() -> T { try { - return future_.get(); + if constexpr (std::is_void_v) + future_.get(); + else + return future_.get(); } catch (...) { @@ -398,76 +402,4 @@ class FutureWithErrorHandler bool has_callback_{false}; }; -/** - * @brief Specialization of FutureWithErrorHandler for @c void futures. - * - * Behaves identically to the primary template except that get() returns - * @c void instead of a value. 
- * - * @see FutureWithErrorHandler - */ -template <> -class FutureWithErrorHandler -{ - public: - explicit FutureWithErrorHandler(std::future future) : future_(std::move(future)), error_callback_(nullptr) - { - } - - FutureWithErrorHandler(FutureWithErrorHandler const&) = delete; - auto operator=(FutureWithErrorHandler const&) -> FutureWithErrorHandler& = delete; - FutureWithErrorHandler(FutureWithErrorHandler&&) = default; - auto operator=(FutureWithErrorHandler&&) -> FutureWithErrorHandler& = default; - - auto on_error(std::function callback) -> FutureWithErrorHandler& - { - error_callback_ = std::move(callback); - has_callback_ = true; - return *this; - } - - void get() - { - try - { - future_.get(); - } - catch (...) - { - if (has_callback_ && error_callback_) - { - error_callback_(std::current_exception()); - } - throw; - } - } - - void wait() const - { - future_.wait(); - } - - template - auto wait_for(std::chrono::duration const& timeout_duration) const - { - return future_.wait_for(timeout_duration); - } - - template - auto wait_until(std::chrono::time_point const& timeout_time) const - { - return future_.wait_until(timeout_time); - } - - [[nodiscard]] auto valid() const -> bool - { - return future_.valid(); - } - - private: - std::future future_; - std::function error_callback_; - bool has_callback_{}; -}; - } // namespace threadschedule diff --git a/include/threadschedule/pthread_wrapper.hpp b/include/threadschedule/pthread_wrapper.hpp index 86c8b4d..db6485a 100644 --- a/include/threadschedule/pthread_wrapper.hpp +++ b/include/threadschedule/pthread_wrapper.hpp @@ -146,24 +146,14 @@ class PThreadWrapper return thread_; } - // Extended pthread functionality [[nodiscard]] auto set_name(std::string const& name) const -> expected { - if (name.length() > 15) - return expected(unexpect, std::make_error_code(std::errc::invalid_argument)); - if (pthread_setname_np(thread_, name.c_str()) == 0) - return {}; - return expected(unexpect, std::error_code(errno, 
std::generic_category())); + return detail::apply_name(thread_, name); } [[nodiscard]] auto get_name() const -> std::optional { - char name[16]; // Linux limit + 1 - if (pthread_getname_np(thread_, name, sizeof(name)) == 0) - { - return std::string(name); - } - return std::nullopt; + return detail::read_name(thread_); } [[nodiscard]] auto set_priority(ThreadPriority priority) const -> expected @@ -184,12 +174,7 @@ class PThreadWrapper [[nodiscard]] auto get_affinity() const -> std::optional { - ThreadAffinity affinity; - if (pthread_getaffinity_np(thread_, sizeof(cpu_set_t), const_cast(&affinity.native_handle())) == 0) - { - return affinity; - } - return std::nullopt; + return detail::read_affinity(thread_); } // Cancellation support diff --git a/include/threadschedule/scheduler_policy.hpp b/include/threadschedule/scheduler_policy.hpp index 3b201fd..cb75892 100644 --- a/include/threadschedule/scheduler_policy.hpp +++ b/include/threadschedule/scheduler_policy.hpp @@ -3,6 +3,7 @@ #include "expected.hpp" #include #include +#include #include #include #include @@ -553,6 +554,79 @@ inline auto apply_affinity(HANDLE handle, ThreadAffinity const& affinity) -> exp return unexpected(std::make_error_code(std::errc::operation_not_permitted)); } +inline auto apply_name(HANDLE handle, std::string const& name) -> expected +{ + if (!handle) + return unexpected(std::make_error_code(std::errc::no_such_process)); + using SetThreadDescriptionFn = HRESULT(WINAPI*)(HANDLE, PCWSTR); + HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); + if (!hMod) + return unexpected(std::make_error_code(std::errc::function_not_supported)); + auto set_desc = reinterpret_cast( + reinterpret_cast(GetProcAddress(hMod, "SetThreadDescription"))); + if (!set_desc) + return unexpected(std::make_error_code(std::errc::function_not_supported)); + std::wstring wide(name.begin(), name.end()); + if (SUCCEEDED(set_desc(handle, wide.c_str()))) + return {}; + return 
unexpected(std::make_error_code(std::errc::operation_not_permitted)); +} + +inline auto read_name(HANDLE handle) -> std::optional +{ + if (!handle) + return std::nullopt; + using GetThreadDescriptionFn = HRESULT(WINAPI*)(HANDLE, PWSTR*); + HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); + if (!hMod) + return std::nullopt; + auto get_desc = reinterpret_cast( + reinterpret_cast(GetProcAddress(hMod, "GetThreadDescription"))); + if (!get_desc) + return std::nullopt; + PWSTR thread_name = nullptr; + if (SUCCEEDED(get_desc(handle, &thread_name)) && thread_name) + { + int size = WideCharToMultiByte(CP_UTF8, 0, thread_name, -1, nullptr, 0, nullptr, nullptr); + if (size > 0) + { + std::string result(size - 1, '\0'); + WideCharToMultiByte(CP_UTF8, 0, thread_name, -1, &result[0], size, nullptr, nullptr); + LocalFree(thread_name); + return result; + } + LocalFree(thread_name); + } + return std::nullopt; +} + +inline auto read_affinity(HANDLE handle) -> std::optional +{ + if (!handle) + return std::nullopt; + using GetThreadGroupAffinityFn = BOOL(WINAPI*)(HANDLE, PGROUP_AFFINITY); + HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); + if (!hMod) + return std::nullopt; + auto get_group_affinity = reinterpret_cast( + reinterpret_cast(GetProcAddress(hMod, "GetThreadGroupAffinity"))); + if (!get_group_affinity) + return std::nullopt; + GROUP_AFFINITY ga{}; + if (get_group_affinity(handle, &ga) != 0) + { + ThreadAffinity affinity; + for (int i = 0; i < 64; ++i) + { + if ((ga.Mask & (static_cast(1) << i)) != 0) + affinity.add_cpu(static_cast(ga.Group) * 64 + i); + } + if (affinity.has_any()) + return affinity; + } + return std::nullopt; +} + #else // POSIX // --- pthread_t overloads (BaseThreadWrapper, ThreadControlBlock, PThreadWrapper) --- @@ -587,6 +661,40 @@ inline auto apply_affinity(pthread_t handle, ThreadAffinity const& affinity) -> return unexpected(std::error_code(errno, std::generic_category())); } +inline auto apply_name(pthread_t handle, std::string const& name) -> 
expected +{ + if (name.length() > 15) + return unexpected(std::make_error_code(std::errc::invalid_argument)); + if (pthread_setname_np(handle, name.c_str()) == 0) + return {}; + return unexpected(std::error_code(errno, std::generic_category())); +} + +inline auto read_name(pthread_t handle) -> std::optional +{ + char name[16]; + if (pthread_getname_np(handle, name, sizeof(name)) == 0) + return std::string(name); + return std::nullopt; +} + +inline auto read_affinity(pthread_t handle) -> std::optional +{ + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + if (pthread_getaffinity_np(handle, sizeof(cpu_set_t), &cpuset) == 0) + { + std::vector cpus; + for (int i = 0; i < CPU_SETSIZE; ++i) + { + if (CPU_ISSET(i, &cpuset)) + cpus.push_back(i); + } + return ThreadAffinity(cpus); + } + return std::nullopt; +} + // --- pid_t / TID overloads (ThreadByNameView) --- inline auto apply_priority(pid_t tid, ThreadPriority priority) -> expected diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index 5a488f2..c26e6f2 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -16,6 +16,62 @@ namespace threadschedule { +namespace detail +{ + +template +inline auto configure_worker_threads(WorkerRange& workers, std::string const& name_prefix, SchedulingPolicy policy, + ThreadPriority priority) -> expected +{ + bool success = true; + for (size_t i = 0; i < workers.size(); ++i) + { + std::string const thread_name = name_prefix + "_" + std::to_string(i); + if (!workers[i].set_name(thread_name).has_value()) + success = false; + if (!workers[i].set_scheduling_policy(policy, priority).has_value()) + success = false; + } + if (success) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); +} + +template +inline auto set_worker_affinity(WorkerRange& workers, ThreadAffinity const& affinity) -> expected +{ + bool success = true; + for (auto& worker : workers) + { + if 
(!worker.set_affinity(affinity).has_value()) + success = false; + } + if (success) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); +} + +template +inline auto distribute_workers_across_cpus(WorkerRange& workers) -> expected +{ + auto const cpu_count = std::thread::hardware_concurrency(); + if (cpu_count == 0) + return unexpected(std::make_error_code(std::errc::invalid_argument)); + + bool success = true; + for (size_t i = 0; i < workers.size(); ++i) + { + ThreadAffinity affinity({static_cast(i % cpu_count)}); + if (!workers[i].set_affinity(affinity).has_value()) + success = false; + } + if (success) + return {}; + return unexpected(std::make_error_code(std::errc::operation_not_permitted)); +} + +} // namespace detail + /** * @brief Work-stealing deque for per-thread task queues in a thread pool. * @@ -429,62 +485,17 @@ class HighPerformancePool auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, ThreadPriority priority = ThreadPriority::normal()) -> expected { - bool success = true; - - for (size_t i = 0; i < workers_.size(); ++i) - { - std::string const thread_name = name_prefix + "_" + std::to_string(i); - - if (!workers_[i].set_name(thread_name).has_value()) - { - success = false; - } - - if (!workers_[i].set_scheduling_policy(policy, priority).has_value()) - { - success = false; - } - } - if (success) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::configure_worker_threads(workers_, name_prefix, policy, priority); } auto set_affinity(ThreadAffinity const& affinity) -> expected { - bool success = true; - - for (auto& worker : workers_) - { - if (!worker.set_affinity(affinity).has_value()) - { - success = false; - } - } - if (success) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::set_worker_affinity(workers_, affinity); } auto 
distribute_across_cpus() -> expected { - auto const cpu_count = std::thread::hardware_concurrency(); - if (cpu_count == 0) - return unexpected(std::make_error_code(std::errc::invalid_argument)); - - bool success = true; - - for (size_t i = 0; i < workers_.size(); ++i) - { - ThreadAffinity affinity({static_cast(i % cpu_count)}); - if (!workers_[i].set_affinity(affinity).has_value()) - { - success = false; - } - } - if (success) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::distribute_workers_across_cpus(workers_); } void wait_for_tasks() @@ -877,25 +888,7 @@ class ThreadPoolBase auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, ThreadPriority priority = ThreadPriority::normal()) -> expected { - bool success = true; - - for (size_t i = 0; i < workers_.size(); ++i) - { - std::string const thread_name = name_prefix + "_" + std::to_string(i); - - if (!workers_[i].set_name(thread_name).has_value()) - { - success = false; - } - - if (!workers_[i].set_scheduling_policy(policy, priority).has_value()) - { - success = false; - } - } - if (success) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::configure_worker_threads(workers_, name_prefix, policy, priority); } /** @@ -903,18 +896,7 @@ class ThreadPoolBase */ auto set_affinity(ThreadAffinity const& affinity) -> expected { - bool success = true; - - for (auto& worker : workers_) - { - if (!worker.set_affinity(affinity).has_value()) - { - success = false; - } - } - if (success) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::set_worker_affinity(workers_, affinity); } /** @@ -922,23 +904,7 @@ class ThreadPoolBase */ auto distribute_across_cpus() -> expected { - auto const cpu_count = std::thread::hardware_concurrency(); - if (cpu_count == 0) - return 
unexpected(std::make_error_code(std::errc::invalid_argument)); - - bool success = true; - - for (size_t i = 0; i < workers_.size(); ++i) - { - ThreadAffinity affinity({static_cast(i % cpu_count)}); - if (!workers_[i].set_affinity(affinity).has_value()) - { - success = false; - } - } - if (success) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::distribute_workers_across_cpus(workers_); } void wait_for_tasks() diff --git a/include/threadschedule/thread_registry.hpp b/include/threadschedule/thread_registry.hpp index bb35270..699a830 100644 --- a/include/threadschedule/thread_registry.hpp +++ b/include/threadschedule/thread_registry.hpp @@ -187,26 +187,9 @@ class ThreadControlBlock [[nodiscard]] auto set_name(std::string const& name) const -> expected { #ifdef _WIN32 - if (!handle_) - return unexpected(std::make_error_code(std::errc::no_such_process)); - using SetThreadDescriptionFn = HRESULT(WINAPI*)(HANDLE, PCWSTR); - HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); - if (!hMod) - return unexpected(std::make_error_code(std::errc::function_not_supported)); - auto set_desc = reinterpret_cast( - reinterpret_cast(GetProcAddress(hMod, "SetThreadDescription"))); - if (!set_desc) - return unexpected(std::make_error_code(std::errc::function_not_supported)); - std::wstring wide(name.begin(), name.end()); - if (SUCCEEDED(set_desc(handle_, wide.c_str()))) - return {}; - return unexpected(std::make_error_code(std::errc::operation_not_permitted)); + return detail::apply_name(handle_, name); #else - if (name.length() > 15) - return unexpected(std::make_error_code(std::errc::invalid_argument)); - if (pthread_setname_np(pthreadHandle_, name.c_str()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return detail::apply_name(pthreadHandle_, name); #endif } @@ -795,6 +778,89 @@ inline auto build_mode_string() -> char const* return is_runtime_build ? 
"runtime" : "header-only"; } +namespace detail +{ + +/** + * @brief CRTP mixin that provides functional-style query facade methods. + * + * The derived class must implement a public @c query() method returning a + * @ref ThreadRegistry::QueryView. All facade methods (filter, map, for_each, + * find_if, any, all, none, take, skip, count, empty, apply) delegate to it. + * + * @tparam Derived CRTP derived type. + */ +template +class QueryFacadeMixin +{ + auto self() const -> Derived const& { return static_cast(*this); } + + public: + template + [[nodiscard]] auto filter(Predicate&& pred) const -> ThreadRegistry::QueryView + { + return self().query().filter(std::forward(pred)); + } + + [[nodiscard]] auto count() const -> size_t { return self().query().count(); } + + [[nodiscard]] auto empty() const -> bool { return self().query().empty(); } + + template + void for_each(Fn&& fn) const + { + self().query().for_each(std::forward(fn)); + } + + template + void apply(Predicate&& pred, Fn&& fn) const + { + self().query().filter(std::forward(pred)).for_each(std::forward(fn)); + } + + template + [[nodiscard]] auto map(Fn&& fn) const -> std::vector> + { + return self().query().map(std::forward(fn)); + } + + template + [[nodiscard]] auto find_if(Predicate&& pred) const -> std::optional + { + return self().query().find_if(std::forward(pred)); + } + + template + [[nodiscard]] auto any(Predicate&& pred) const -> bool + { + return self().query().any(std::forward(pred)); + } + + template + [[nodiscard]] auto all(Predicate&& pred) const -> bool + { + return self().query().all(std::forward(pred)); + } + + template + [[nodiscard]] auto none(Predicate&& pred) const -> bool + { + return self().query().none(std::forward(pred)); + } + + [[nodiscard]] auto take(size_t n) const -> ThreadRegistry::QueryView + { + return self().query().take(n); + } + + [[nodiscard]] auto skip(size_t n) const -> ThreadRegistry::QueryView + { + return self().query().skip(n); + } +}; + +} // namespace detail + /** 
* @brief Aggregates multiple ThreadRegistry instances into a single queryable * view. @@ -824,9 +890,9 @@ inline auto build_mode_string() -> char const* * query() iterates over every attached registry, calls its own query(), and * concatenates the results into a single @ref ThreadRegistry::QueryView snapshot. * The same functional-style helpers (filter, map, for_each, etc.) are - * available directly on CompositeThreadRegistry for convenience. + * inherited from @ref detail::QueryFacadeMixin. */ -class CompositeThreadRegistry +class CompositeThreadRegistry : public detail::QueryFacadeMixin { public: void attach(ThreadRegistry* reg) @@ -837,7 +903,6 @@ class CompositeThreadRegistry registries_.push_back(reg); } - // Chainable query API [[nodiscard]] auto query() const -> ThreadRegistry::QueryView { std::vector merged; @@ -855,74 +920,6 @@ class CompositeThreadRegistry return ThreadRegistry::QueryView(std::move(merged)); } - template - [[nodiscard]] auto filter(Predicate&& pred) const -> ThreadRegistry::QueryView - { - return query().filter(std::forward(pred)); - } - - [[nodiscard]] auto count() const -> size_t - { - return query().count(); - } - - [[nodiscard]] auto empty() const -> bool - { - return query().empty(); - } - - template - void for_each(Fn&& fn) const - { - query().for_each(std::forward(fn)); - } - - template - void apply(Predicate&& pred, Fn&& fn) const - { - query().filter(std::forward(pred)).for_each(std::forward(fn)); - } - - template - [[nodiscard]] auto map(Fn&& fn) const -> std::vector> - { - return query().map(std::forward(fn)); - } - - template - [[nodiscard]] auto find_if(Predicate&& pred) const -> std::optional - { - return query().find_if(std::forward(pred)); - } - - template - [[nodiscard]] auto any(Predicate&& pred) const -> bool - { - return query().any(std::forward(pred)); - } - - template - [[nodiscard]] auto all(Predicate&& pred) const -> bool - { - return query().all(std::forward(pred)); - } - - template - [[nodiscard]] auto 
none(Predicate&& pred) const -> bool - { - return query().none(std::forward(pred)); - } - - [[nodiscard]] auto take(size_t n) const -> ThreadRegistry::QueryView - { - return query().take(n); - } - - [[nodiscard]] auto skip(size_t n) const -> ThreadRegistry::QueryView - { - return query().skip(n); - } - private: mutable std::mutex mutex_; std::vector registries_; diff --git a/include/threadschedule/thread_wrapper.hpp b/include/threadschedule/thread_wrapper.hpp index df270c0..2847cb5 100644 --- a/include/threadschedule/thread_wrapper.hpp +++ b/include/threadschedule/thread_wrapper.hpp @@ -202,79 +202,14 @@ class BaseThreadWrapper : protected detail::ThreadStorage expected { -#ifdef _WIN32 - // Windows supports longer thread names. Try SetThreadDescription dynamically. - auto const handle = native_handle(); - std::wstring wide_name(name.begin(), name.end()); - - using SetThreadDescriptionFn = HRESULT(WINAPI*)(HANDLE, PCWSTR); - HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); - if (hMod) - { - auto set_desc = reinterpret_cast( - reinterpret_cast(GetProcAddress(hMod, "SetThreadDescription"))); - if (set_desc) - { - if (SUCCEEDED(set_desc(handle, wide_name.c_str()))) - return expected(); - return expected(unexpect, std::make_error_code(std::errc::invalid_argument)); - } - } - // Fallback unavailable - return expected(unexpect, std::make_error_code(std::errc::function_not_supported)); -#else - if (name.length() > 15) - return expected(unexpect, std::make_error_code(std::errc::invalid_argument)); - - auto const handle = native_handle(); - if (pthread_setname_np(handle, name.c_str()) == 0) - return {}; - return expected(unexpect, std::error_code(errno, std::generic_category())); -#endif + return detail::apply_name(native_handle(), name); } [[nodiscard]] auto get_name() const -> std::optional { -#ifdef _WIN32 - const auto handle = const_cast(this)->native_handle(); - using GetThreadDescriptionFn = HRESULT(WINAPI*)(HANDLE, PWSTR*); - HMODULE hMod = 
GetModuleHandleW(L"kernel32.dll"); - if (hMod) - { - auto get_desc = reinterpret_cast( - reinterpret_cast(GetProcAddress(hMod, "GetThreadDescription"))); - if (get_desc) - { - PWSTR thread_name = nullptr; - HRESULT hr = get_desc(handle, &thread_name); - if (SUCCEEDED(hr) && thread_name) - { - int size = WideCharToMultiByte(CP_UTF8, 0, thread_name, -1, nullptr, 0, nullptr, nullptr); - if (size > 0) - { - std::string result(size - 1, '\0'); - WideCharToMultiByte(CP_UTF8, 0, thread_name, -1, &result[0], size, nullptr, nullptr); - LocalFree(thread_name); - return result; - } - LocalFree(thread_name); - } - } - } - return std::nullopt; -#else - char name[16]; // Linux limit + 1 - auto const handle = const_cast(this)->native_handle(); - - if (pthread_getname_np(handle, name, sizeof(name)) == 0) - { - return std::string(name); - } - return std::nullopt; -#endif + return detail::read_name(const_cast(this)->native_handle()); } [[nodiscard]] auto set_priority(ThreadPriority priority) -> expected @@ -295,46 +230,7 @@ class BaseThreadWrapper : protected detail::ThreadStorage std::optional { -#ifdef _WIN32 - const auto handle = const_cast(this)->native_handle(); - using GetThreadGroupAffinityFn = BOOL(WINAPI*)(HANDLE, PGROUP_AFFINITY); - HMODULE hMod = GetModuleHandleW(L"kernel32.dll"); - if (hMod) - { - auto get_group_affinity = reinterpret_cast( - reinterpret_cast(GetProcAddress(hMod, "GetThreadGroupAffinity"))); - if (get_group_affinity) - { - GROUP_AFFINITY ga{}; - if (get_group_affinity(handle, &ga) != 0) - { - ThreadAffinity affinity; - for (int i = 0; i < 64; ++i) - { - if ((ga.Mask & (static_cast(1) << i)) != 0) - { - affinity.add_cpu(static_cast(ga.Group) * 64 + i); - } - } - if (affinity.has_any()) - { - return affinity; - } - return std::nullopt; - } - } - return std::nullopt; - } -#else - ThreadAffinity affinity; - auto const handle = const_cast(this)->native_handle(); - - if (pthread_getaffinity_np(handle, sizeof(cpu_set_t), &affinity.native_handle()) == 0) - { - 
return affinity; - } - return std::nullopt; -#endif + return detail::read_affinity(const_cast(this)->native_handle()); } // Nice value (process-level, affects all threads) From b727db9e225652aa0af3e71e6ae61a19ec075dc6 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 16:19:23 +0200 Subject: [PATCH 03/15] Refactor thread management and error handling for improved clarity and maintainability - Consolidated `ThreadRegistry` methods to inherit from `detail::QueryFacadeMixin`, reducing code duplication in query operations. - Streamlined POSIX scheduling helpers by merging `apply_priority` and `apply_scheduling_policy` into a shared implementation. - Simplified error handling in `PoolWithErrors` by introducing a new static factory method `TaskError::capture()` to centralize exception capturing. - Reduced duplicated logic in `ThreadRegistry::register_current_thread` by delegating to a private `try_register` method. - Updated CHANGELOG to reflect these enhancements and code reductions across multiple files. --- CHANGELOG.md | 26 +- include/threadschedule/error_handler.hpp | 26 +- include/threadschedule/scheduler_policy.hpp | 51 ++- .../thread_pool_with_errors.hpp | 63 ++-- include/threadschedule/thread_registry.hpp | 328 ++++++------------ 5 files changed, 202 insertions(+), 292 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 29f1f26..273677a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -102,7 +102,31 @@ auto futures = pool.submit_batch(tasks.begin(), tasks.end()); empty, apply) are now inherited from `detail::QueryFacadeMixin` CRTP base. No API change. -- Net reduction: ~116 lines across 6 files. +- **`ThreadRegistry` inherits `detail::QueryFacadeMixin`**: The 12 facade + methods (filter, map, for_each, find_if, any, all, none, take, skip, count, + empty, apply) are now provided by the same CRTP mixin as + `CompositeThreadRegistry`, eliminating the duplicate implementations. 
+ +- **POSIX scheduling helpers consolidated**: `apply_priority` and + `apply_scheduling_policy` for both `pthread_t` and `pid_t` now share a + common `detail::apply_sched_params` template, eliminating duplicated param + validation and error handling. + +- **`ThreadRegistry::register_current_thread` consolidated**: Both overloads + now delegate to a private `try_register(RegisteredThreadInfo)` method, + removing the duplicated lock/emplace/callback logic. + +- **`PoolWithErrors` submit methods consolidated**: `submit()` and + `submit_with_description()` now delegate to a private `submit_impl` with + optional description parameter. + +- **`TaskError::capture()` factory**: New static factory method centralizes + the repeated exception/thread_id/timestamp capture pattern. Used by + `ErrorHandledTask` and `PoolWithErrors`. + +- **`ThreadControlBlock` native handle accessor**: Private `native_handle()` + method replaces four identical `#ifdef _WIN32` dispatch blocks in the + set_affinity/set_priority/set_scheduling_policy/set_name methods. ## v1.4.1 diff --git a/include/threadschedule/error_handler.hpp b/include/threadschedule/error_handler.hpp index ebc4161..c9a30a0 100644 --- a/include/threadschedule/error_handler.hpp +++ b/include/threadschedule/error_handler.hpp @@ -37,6 +37,22 @@ struct TaskError /** @brief Monotonic timestamp recorded immediately after the exception was caught. */ std::chrono::steady_clock::time_point timestamp; + /** + * @brief Capture the current in-flight exception into a TaskError. + * + * Must be called inside a @c catch block. Fills exception, thread_id, + * and timestamp; optionally sets task_description. 
+ */ + static auto capture(std::string description = {}) -> TaskError + { + TaskError err; + err.exception = std::current_exception(); + err.task_description = std::move(description); + err.thread_id = std::this_thread::get_id(); + err.timestamp = std::chrono::steady_clock::now(); + return err; + } + /** * @brief Extract the message string from the stored exception. * @@ -239,15 +255,7 @@ class ErrorHandledTask catch (...) { if (handler_) - { - TaskError error; - error.exception = std::current_exception(); - error.task_description = description_; - error.thread_id = std::this_thread::get_id(); - error.timestamp = std::chrono::steady_clock::now(); - - handler_->handle_error(error); - } + handler_->handle_error(TaskError::capture(description_)); } } diff --git a/include/threadschedule/scheduler_policy.hpp b/include/threadschedule/scheduler_policy.hpp index cb75892..61a9dfc 100644 --- a/include/threadschedule/scheduler_policy.hpp +++ b/include/threadschedule/scheduler_policy.hpp @@ -629,29 +629,33 @@ inline auto read_affinity(HANDLE handle) -> std::optional #else // POSIX -// --- pthread_t overloads (BaseThreadWrapper, ThreadControlBlock, PThreadWrapper) --- +// --- shared implementation for pthread_t and pid_t scheduling --- -inline auto apply_priority(pthread_t handle, ThreadPriority priority) -> expected +template +inline auto apply_sched_params(SchedulingPolicy policy, ThreadPriority priority, SetSchedFn&& set_sched) + -> expected { - int const policy = SCHED_OTHER; - auto params_result = SchedulerParams::create_for_policy(SchedulingPolicy::OTHER, priority); + int const policy_int = static_cast(policy); + auto params_result = SchedulerParams::create_for_policy(policy, priority); if (!params_result.has_value()) return unexpected(params_result.error()); - if (pthread_setschedparam(handle, policy, ¶ms_result.value()) == 0) + if (set_sched(policy_int, ¶ms_result.value()) == 0) return {}; return unexpected(std::error_code(errno, std::generic_category())); } +// --- 
pthread_t overloads (BaseThreadWrapper, ThreadControlBlock, PThreadWrapper) --- + inline auto apply_scheduling_policy(pthread_t handle, SchedulingPolicy policy, ThreadPriority priority) -> expected { - int const policy_int = static_cast(policy); - auto params_result = SchedulerParams::create_for_policy(policy, priority); - if (!params_result.has_value()) - return unexpected(params_result.error()); - if (pthread_setschedparam(handle, policy_int, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return apply_sched_params(policy, priority, + [handle](int p, sched_param* sp) { return pthread_setschedparam(handle, p, sp); }); +} + +inline auto apply_priority(pthread_t handle, ThreadPriority priority) -> expected +{ + return apply_scheduling_policy(handle, SchedulingPolicy::OTHER, priority); } inline auto apply_affinity(pthread_t handle, ThreadAffinity const& affinity) -> expected @@ -697,27 +701,16 @@ inline auto read_affinity(pthread_t handle) -> std::optional // --- pid_t / TID overloads (ThreadByNameView) --- -inline auto apply_priority(pid_t tid, ThreadPriority priority) -> expected +inline auto apply_scheduling_policy(pid_t tid, SchedulingPolicy policy, ThreadPriority priority) + -> expected { - int const policy = SCHED_OTHER; - auto params_result = SchedulerParams::create_for_policy(SchedulingPolicy::OTHER, priority); - if (!params_result.has_value()) - return unexpected(params_result.error()); - if (sched_setscheduler(tid, policy, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return apply_sched_params(policy, priority, + [tid](int p, sched_param* sp) { return sched_setscheduler(tid, p, sp); }); } -inline auto apply_scheduling_policy(pid_t tid, SchedulingPolicy policy, ThreadPriority priority) - -> expected +inline auto apply_priority(pid_t tid, ThreadPriority priority) -> expected { - int const policy_int = static_cast(policy); - auto 
params_result = SchedulerParams::create_for_policy(policy, priority); - if (!params_result.has_value()) - return unexpected(params_result.error()); - if (sched_setscheduler(tid, policy_int, ¶ms_result.value()) == 0) - return {}; - return unexpected(std::error_code(errno, std::generic_category())); + return apply_scheduling_policy(tid, SchedulingPolicy::OTHER, priority); } inline auto apply_affinity(pid_t tid, ThreadAffinity const& affinity) -> expected diff --git a/include/threadschedule/thread_pool_with_errors.hpp b/include/threadschedule/thread_pool_with_errors.hpp index 36b5c61..5e7e419 100644 --- a/include/threadschedule/thread_pool_with_errors.hpp +++ b/include/threadschedule/thread_pool_with_errors.hpp @@ -38,25 +38,7 @@ class PoolWithErrors template auto submit(F&& f, Args&&... args) -> FutureWithErrorHandler> { - auto handler = error_handler_; - auto wrapped_task = [f = std::forward(f), args = std::make_tuple(std::forward(args)...), handler]() { - try - { - return std::apply(f, args); - } - catch (...) - { - TaskError error; - error.exception = std::current_exception(); - error.thread_id = std::this_thread::get_id(); - error.timestamp = std::chrono::steady_clock::now(); - handler->handle_error(error); - throw; - } - }; - - auto future = pool_.submit(std::move(wrapped_task)); - return FutureWithErrorHandler>(std::move(future)); + return submit_impl({}, std::forward(f), std::forward(args)...); } /** @@ -66,27 +48,7 @@ class PoolWithErrors auto submit_with_description(std::string const& description, F&& f, Args&&... args) -> FutureWithErrorHandler> { - auto handler = error_handler_; - auto wrapped_task = [f = std::forward(f), args = std::make_tuple(std::forward(args)...), handler, - description]() { - try - { - return std::apply(f, args); - } - catch (...) 
- { - TaskError error; - error.exception = std::current_exception(); - error.task_description = description; - error.thread_id = std::this_thread::get_id(); - error.timestamp = std::chrono::steady_clock::now(); - handler->handle_error(error); - throw; - } - }; - - auto future = pool_.submit(std::move(wrapped_task)); - return FutureWithErrorHandler>(std::move(future)); + return submit_impl(description, std::forward(f), std::forward(args)...); } auto add_error_callback(ErrorCallback callback) -> size_t @@ -156,6 +118,27 @@ class PoolWithErrors } private: + template + auto submit_impl(std::string description, F&& f, Args&&... args) + -> FutureWithErrorHandler> + { + auto handler = error_handler_; + auto wrapped_task = [f = std::forward(f), args = std::make_tuple(std::forward(args)...), handler, + desc = std::move(description)]() { + try + { + return std::apply(f, args); + } + catch (...) + { + handler->handle_error(TaskError::capture(desc)); + throw; + } + }; + auto future = pool_.submit(std::move(wrapped_task)); + return FutureWithErrorHandler>(std::move(future)); + } + PoolType pool_; std::shared_ptr error_handler_; }; diff --git a/include/threadschedule/thread_registry.hpp b/include/threadschedule/thread_registry.hpp index 699a830..7dd5b9e 100644 --- a/include/threadschedule/thread_registry.hpp +++ b/include/threadschedule/thread_registry.hpp @@ -154,43 +154,36 @@ class ThreadControlBlock { return stdId_; } - // Removed name/component metadata from control block; metadata lives in RegisteredThreadInfo - - [[nodiscard]] auto set_affinity(ThreadAffinity const& affinity) const -> expected + private: + [[nodiscard]] auto native_handle() const { #ifdef _WIN32 - return detail::apply_affinity(handle_, affinity); + return handle_; #else - return detail::apply_affinity(pthreadHandle_, affinity); + return pthreadHandle_; #endif } + public: + [[nodiscard]] auto set_affinity(ThreadAffinity const& affinity) const -> expected + { + return detail::apply_affinity(native_handle(), 
affinity); + } + [[nodiscard]] auto set_priority(ThreadPriority priority) const -> expected { -#ifdef _WIN32 - return detail::apply_priority(handle_, priority); -#else - return detail::apply_priority(pthreadHandle_, priority); -#endif + return detail::apply_priority(native_handle(), priority); } [[nodiscard]] auto set_scheduling_policy(SchedulingPolicy policy, ThreadPriority priority) const -> expected { -#ifdef _WIN32 - return detail::apply_scheduling_policy(handle_, policy, priority); -#else - return detail::apply_scheduling_policy(pthreadHandle_, policy, priority); -#endif + return detail::apply_scheduling_policy(native_handle(), policy, priority); } [[nodiscard]] auto set_name(std::string const& name) const -> expected { -#ifdef _WIN32 - return detail::apply_name(handle_, name); -#else - return detail::apply_name(pthreadHandle_, name); -#endif + return detail::apply_name(native_handle(), name); } static auto create_for_current_thread() -> std::shared_ptr @@ -219,6 +212,86 @@ class ThreadControlBlock #endif }; +namespace detail +{ + +/** + * @brief CRTP mixin that provides functional-style query facade methods. + * + * The derived class must implement a public @c query() method returning a + * QueryView-like object. All facade methods (filter, map, for_each, + * find_if, any, all, none, take, skip, count, empty, apply) delegate to it. + * + * Return types are deduced via @c auto so the mixin can be used as a base + * class before the concrete QueryView type is fully defined (CRTP). + * + * @tparam Derived CRTP derived type. 
+ */ +template +class QueryFacadeMixin +{ + auto self() const -> Derived const& { return static_cast(*this); } + + public: + template + [[nodiscard]] auto filter(Predicate&& pred) const + { + return self().query().filter(std::forward(pred)); + } + + [[nodiscard]] auto count() const -> size_t { return self().query().count(); } + + [[nodiscard]] auto empty() const -> bool { return self().query().empty(); } + + template + void for_each(Fn&& fn) const + { + self().query().for_each(std::forward(fn)); + } + + template + void apply(Predicate&& pred, Fn&& fn) const + { + self().query().filter(std::forward(pred)).for_each(std::forward(fn)); + } + + template + [[nodiscard]] auto map(Fn&& fn) const -> std::vector> + { + return self().query().map(std::forward(fn)); + } + + template + [[nodiscard]] auto find_if(Predicate&& pred) const -> std::optional + { + return self().query().find_if(std::forward(pred)); + } + + template + [[nodiscard]] auto any(Predicate&& pred) const -> bool + { + return self().query().any(std::forward(pred)); + } + + template + [[nodiscard]] auto all(Predicate&& pred) const -> bool + { + return self().query().all(std::forward(pred)); + } + + template + [[nodiscard]] auto none(Predicate&& pred) const -> bool + { + return self().query().none(std::forward(pred)); + } + + [[nodiscard]] auto take(size_t n) const { return self().query().take(n); } + + [[nodiscard]] auto skip(size_t n) const { return self().query().skip(n); } +}; + +} // namespace detail + /** * @brief Central registry of threads indexed by OS-level thread ID (Tid). * @@ -254,6 +327,8 @@ class ThreadControlBlock * query() returns a @ref QueryView holding a **snapshot** of the registry at the * moment of the call. Subsequent changes to the registry (new * registrations, unregistrations) are not reflected in an existing @ref QueryView. + * The same functional-style helpers (filter, map, for_each, etc.) are + * inherited from @ref detail::QueryFacadeMixin. 
* * @par Scheduling helpers * set_affinity(), set_priority(), set_scheduling_policy(), and set_name() @@ -261,43 +336,22 @@ class ThreadControlBlock * delegate to the control block. Returns @c std::errc::no_such_process if * the TID is not registered or has no control block. */ -class ThreadRegistry +class ThreadRegistry : public detail::QueryFacadeMixin { public: ThreadRegistry() = default; ThreadRegistry(ThreadRegistry const&) = delete; auto operator=(ThreadRegistry const&) -> ThreadRegistry& = delete; - // Register/unregister the CURRENT thread (to be called inside the running thread) void register_current_thread(std::string name = std::string(), std::string componentTag = std::string()) { - Tid const tid = ThreadInfo::get_thread_id(); RegisteredThreadInfo info; - info.tid = tid; + info.tid = ThreadInfo::get_thread_id(); info.stdId = std::this_thread::get_id(); info.name = std::move(name); info.componentTag = std::move(componentTag); info.alive = true; - - { - std::unique_lock lock(mutex_); - auto it = threads_.find(tid); - if (it == threads_.end()) - { - auto stored = info; // copy for callback - threads_.emplace(tid, std::move(info)); - if (onRegister_) - { - auto cb = onRegister_; - lock.unlock(); - cb(stored); - } - } - else - { - // Duplicate registration of the same TID is a no-op (first registration wins) - } - } + try_register(std::move(info)); } void register_current_thread(std::shared_ptr const& controlBlock, @@ -312,23 +366,7 @@ class ThreadRegistry info.componentTag = std::move(componentTag); info.alive = true; info.control = controlBlock; - std::unique_lock lock(mutex_); - auto it = threads_.find(info.tid); - if (it == threads_.end()) - { - auto stored = info; // copy for callback - threads_.emplace(info.tid, std::move(info)); - if (onRegister_) - { - auto cb = onRegister_; - lock.unlock(); - cb(stored); - } - } - else - { - // Duplicate registration of the same TID is a no-op (first registration wins) - } + try_register(std::move(info)); } void 
unregister_current_thread() @@ -528,74 +566,6 @@ class ThreadRegistry return QueryView(std::move(snapshot)); } - template - [[nodiscard]] auto filter(Predicate&& pred) const -> QueryView - { - return query().filter(std::forward(pred)); - } - - [[nodiscard]] auto count() const -> size_t - { - return query().count(); - } - - [[nodiscard]] auto empty() const -> bool - { - return query().empty(); - } - - template - void for_each(Fn&& fn) const - { - query().for_each(std::forward(fn)); - } - - template - void apply(Predicate&& pred, Fn&& fn) const - { - query().filter(std::forward(pred)).for_each(std::forward(fn)); - } - - template - [[nodiscard]] auto map(Fn&& fn) const -> std::vector> - { - return query().map(std::forward(fn)); - } - - template - [[nodiscard]] auto find_if(Predicate&& pred) const -> std::optional - { - return query().find_if(std::forward(pred)); - } - - template - [[nodiscard]] auto any(Predicate&& pred) const -> bool - { - return query().any(std::forward(pred)); - } - - template - [[nodiscard]] auto all(Predicate&& pred) const -> bool - { - return query().all(std::forward(pred)); - } - - template - [[nodiscard]] auto none(Predicate&& pred) const -> bool - { - return query().none(std::forward(pred)); - } - - [[nodiscard]] auto take(size_t n) const -> QueryView - { - return query().take(n); - } - - [[nodiscard]] auto skip(size_t n) const -> QueryView - { - return query().skip(n); - } - [[nodiscard]] auto set_affinity(Tid tid, ThreadAffinity const& affinity) const -> expected { auto blk = lock_block(tid); @@ -643,6 +613,22 @@ class ThreadRegistry } private: + void try_register(RegisteredThreadInfo info) + { + std::unique_lock lock(mutex_); + auto it = threads_.find(info.tid); + if (it != threads_.end()) + return; + auto stored = info; + threads_.emplace(info.tid, std::move(info)); + if (onRegister_) + { + auto cb = onRegister_; + lock.unlock(); + cb(stored); + } + } + [[nodiscard]] auto lock_block(Tid tid) const -> std::shared_ptr { std::shared_lock 
lock(mutex_); @@ -654,7 +640,6 @@ class ThreadRegistry mutable std::shared_mutex mutex_; std::unordered_map threads_; - // Integration hooks std::function onRegister_; std::function onUnregister_; }; @@ -778,89 +763,6 @@ inline auto build_mode_string() -> char const* return is_runtime_build ? "runtime" : "header-only"; } -namespace detail -{ - -/** - * @brief CRTP mixin that provides functional-style query facade methods. - * - * The derived class must implement a public @c query() method returning a - * @ref ThreadRegistry::QueryView. All facade methods (filter, map, for_each, - * find_if, any, all, none, take, skip, count, empty, apply) delegate to it. - * - * @tparam Derived CRTP derived type. - */ -template -class QueryFacadeMixin -{ - auto self() const -> Derived const& { return static_cast(*this); } - - public: - template - [[nodiscard]] auto filter(Predicate&& pred) const -> ThreadRegistry::QueryView - { - return self().query().filter(std::forward(pred)); - } - - [[nodiscard]] auto count() const -> size_t { return self().query().count(); } - - [[nodiscard]] auto empty() const -> bool { return self().query().empty(); } - - template - void for_each(Fn&& fn) const - { - self().query().for_each(std::forward(fn)); - } - - template - void apply(Predicate&& pred, Fn&& fn) const - { - self().query().filter(std::forward(pred)).for_each(std::forward(fn)); - } - - template - [[nodiscard]] auto map(Fn&& fn) const -> std::vector> - { - return self().query().map(std::forward(fn)); - } - - template - [[nodiscard]] auto find_if(Predicate&& pred) const -> std::optional - { - return self().query().find_if(std::forward(pred)); - } - - template - [[nodiscard]] auto any(Predicate&& pred) const -> bool - { - return self().query().any(std::forward(pred)); - } - - template - [[nodiscard]] auto all(Predicate&& pred) const -> bool - { - return self().query().all(std::forward(pred)); - } - - template - [[nodiscard]] auto none(Predicate&& pred) const -> bool - { - return 
self().query().none(std::forward(pred)); - } - - [[nodiscard]] auto take(size_t n) const -> ThreadRegistry::QueryView - { - return self().query().take(n); - } - - [[nodiscard]] auto skip(size_t n) const -> ThreadRegistry::QueryView - { - return self().query().skip(n); - } -}; - -} // namespace detail - /** * @brief Aggregates multiple ThreadRegistry instances into a single queryable * view. From d42e1e1046b674711698d8117933c6ce8ace2459 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 17:11:49 +0200 Subject: [PATCH 04/15] Enhance thread pool functionality and error handling features - Introduced quality-of-life improvements including stable callback management in `ErrorHandler` with `remove_callback(id)` and `has_callback(id)` methods. - Added non-throwing submission methods `try_submit()` and `try_submit_batch()` for all pool types, returning `expected, std::error_code>`. - Implemented chunked work distribution in `parallel_for_each` for better performance across thread pools. - Configured `HighPerformancePool` with adjustable deque capacity and pre-configuration of thread count via `GlobalPool::init(n)`. - Added C++20 ranges overloads for batch submissions and parallel processing. - Introduced cooperative cancellation support with `submit(stop_token, F, Args...)` overloads. - Added new future combinators in `futures.hpp` for enhanced future management. - Updated CHANGELOG to reflect these new features and improvements. 
--- CHANGELOG.md | 58 +- include/threadschedule/error_handler.hpp | 51 +- include/threadschedule/futures.hpp | 186 ++++++ include/threadschedule/task.hpp | 115 +++- include/threadschedule/thread_pool.hpp | 558 +++++++++++++++--- .../thread_pool_with_errors.hpp | 38 ++ include/threadschedule/threadschedule.hpp | 18 +- 7 files changed, 904 insertions(+), 120 deletions(-) create mode 100644 include/threadschedule/futures.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 273677a..f6ca2fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,12 +35,68 @@ - **`GlobalThreadPool`, `GlobalHighPerformancePool`** are now type aliases for `GlobalPool`. The public API is unchanged. +### Quality-of-Life Features + +- **`ErrorHandler::remove_callback(id)` / `has_callback(id)`** -- callbacks + are now stored in a `std::map` with stable IDs. Individual callbacks can be + removed without clearing all of them. + +- **`try_submit()` / `try_submit_batch()`** -- non-throwing submission for all + pool types, returning `expected, std::error_code>` instead of + throwing on shutdown. + +- **Chunked `parallel_for_each`** -- `ThreadPoolBase` now uses the same + chunked work distribution as `HighPerformancePool` via a shared + `detail::parallel_for_each_chunked` helper (one task per element is gone). + +- **`PollingWait`** -- tunable polling interval (default 10 ms). + `FastThreadPool` is `ThreadPoolBase>`. + +- **`HighPerformancePool` deque capacity** -- configurable via constructor: + `HighPerformancePool(threads, deque_capacity)`. + +- **`GlobalPool::init(n)`** -- pre-configure thread count before first use + (std::call_once semantics). + +- **C++20 ranges overloads** -- `submit_batch(range)`, `try_submit_batch(range)`, + `parallel_for_each(range, func)` on all pool types and GlobalPool. Guarded + by `__cpp_lib_ranges`. + +- **Auto-register pool workers** -- opt-in `register_workers` flag on both + pool constructors. 
Workers register/unregister automatically via + `AutoRegisterCurrentThread` RAII guard. + +- **Per-task tracing hooks** -- `set_on_task_start(callback)` and + `set_on_task_end(callback)` on both pool types. Callbacks receive timestamp, + thread ID, and (for end) elapsed duration. + +- **Cooperative cancellation** -- `submit(stop_token, F, Args...)` and + `try_submit(stop_token, F, Args...)` overloads. Tasks are skipped if stop is + requested. Guarded by `__cpp_lib_jthread`. + +- **Future combinators** -- new `futures.hpp` with `when_all`, `when_any`, + `when_all_settled` (typed and void specializations). + +- **Lifecycle modes** -- `ShutdownPolicy::drain` (default) and + `ShutdownPolicy::drop_pending`. `shutdown(policy)` replaces the old + no-argument `shutdown()`. `shutdown_for(timeout)` provides timed drain. + +- **Coroutine scheduler integration** -- `schedule_on{pool}` awaitable to hop + to a pool thread, `executor_base` / `pool_executor` type-erased + executor for pool-aware tasks, `run_on(pool, coro_fn)` convenience returning + `std::future`. + ### New Types - `ThreadPoolBase` - parameterized single-queue thread pool. -- `IndefiniteWait` / `PollingWait` - wait policy types for `ThreadPoolBase`. +- `IndefiniteWait` / `PollingWait` - wait policy types for `ThreadPoolBase`. - `PoolWithErrors` - generic error-handling pool wrapper. - `GlobalPool` - generic singleton pool accessor. +- `ShutdownPolicy` - enum controlling shutdown behavior (drain / drop_pending). +- `TaskStartCallback` / `TaskEndCallback` - tracing callback types. +- `executor_base` / `pool_executor` - type-erased executor for coroutines. +- `schedule_on` - awaitable for hopping to a pool thread. +- `futures.hpp` - future combinators (`when_all`, `when_any`, `when_all_settled`). 
### Internal Improvements diff --git a/include/threadschedule/error_handler.hpp b/include/threadschedule/error_handler.hpp index c9a30a0..ed8b576 100644 --- a/include/threadschedule/error_handler.hpp +++ b/include/threadschedule/error_handler.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -127,11 +128,11 @@ using ErrorCallback = std::function; * - If a callback itself throws, the exception is silently swallowed so that * remaining callbacks still execute. * - * @par Limitations - * add_callback() returns an index that identifies the callback, but there is - * no @c remove_callback() -- only clear_callbacks() removes all callbacks at - * once. The error count returned by error_count() is monotonically - * increasing and is only reset by an explicit call to reset_error_count(). + * @par Callback management + * add_callback() returns a stable ID that can be passed to remove_callback() + * to unregister a single callback. clear_callbacks() removes all at once. + * The error count returned by error_count() is monotonically increasing and + * is only reset by an explicit call to reset_error_count(). */ class ErrorHandler { @@ -140,15 +141,35 @@ class ErrorHandler * @brief Register an error callback. * * @param callback Callable to invoke when a task throws. - * @return Zero-based index (handle) of the newly added callback. - * There is currently no API to remove an individual callback; - * use clear_callbacks() to remove all. + * @return Stable ID for the callback, usable with remove_callback(). */ auto add_callback(ErrorCallback callback) -> size_t { std::lock_guard lock(mutex_); - callbacks_.push_back(std::move(callback)); - return callbacks_.size() - 1; + size_t const id = next_callback_id_++; + callbacks_.emplace(id, std::move(callback)); + return id; + } + + /** + * @brief Remove a single callback by its ID. + * + * @param id The ID returned by add_callback(). 
+ * @return @c true if the callback was found and removed, @c false otherwise. + */ + auto remove_callback(size_t id) -> bool + { + std::lock_guard lock(mutex_); + return callbacks_.erase(id) > 0; + } + + /** + * @brief Check whether a callback with the given ID is registered. + */ + [[nodiscard]] auto has_callback(size_t id) const -> bool + { + std::lock_guard lock(mutex_); + return callbacks_.count(id) > 0; } /** @@ -167,8 +188,8 @@ class ErrorHandler * @brief Dispatch an error to all registered callbacks. * * Increments the internal error counter and then invokes every registered - * callback in order. If any callback throws, the exception is caught and - * silently discarded so that subsequent callbacks still run. + * callback in insertion order. If any callback throws, the exception is + * caught and silently discarded so that subsequent callbacks still run. * * @param error Diagnostic information about the failed task. */ @@ -177,7 +198,7 @@ class ErrorHandler std::lock_guard lock(mutex_); error_count_++; - for (auto const& callback : callbacks_) + for (auto const& [id, callback] : callbacks_) { try { @@ -185,7 +206,6 @@ class ErrorHandler } catch (...) { - // Error handlers should not throw, but we catch just in case } } } @@ -215,7 +235,8 @@ class ErrorHandler private: mutable std::mutex mutex_; - std::vector callbacks_; + std::map callbacks_; + size_t next_callback_id_{0}; size_t error_count_{0}; }; diff --git a/include/threadschedule/futures.hpp b/include/threadschedule/futures.hpp new file mode 100644 index 0000000..b637bfc --- /dev/null +++ b/include/threadschedule/futures.hpp @@ -0,0 +1,186 @@ +#pragma once + +/** + * @file futures.hpp + * @brief Combinators for @c std::future: @ref when_all, @ref when_any, + * @ref when_all_settled. + * + * These utilities simplify waiting on multiple futures produced by thread + * pool submissions. 
+ */ + +#include "expected.hpp" + +#include +#include +#include +#include +#include + +namespace threadschedule +{ + +/** + * @brief Block until all futures complete, returning results in submission order. + * + * If any future throws, the first exception is captured and re-thrown after + * all remaining futures have been waited on (to avoid leaving them dangling). + * + * @tparam T The value type of each future. + * @param futures A vector of futures to wait on. Moved-from on return. + * @return A vector of values in the same order as the input futures. + */ +template +auto when_all(std::vector>& futures) -> std::vector +{ + std::vector results; + results.reserve(futures.size()); + std::exception_ptr first_error; + + for (auto& f : futures) + { + try + { + results.push_back(f.get()); + } + catch (...) + { + if (!first_error) + first_error = std::current_exception(); + results.emplace_back(); + } + } + + if (first_error) + std::rethrow_exception(first_error); + + return results; +} + +/** + * @brief Block until all void futures complete. + * + * Re-throws the first exception after all futures have been waited on. + */ +inline void when_all(std::vector>& futures) +{ + std::exception_ptr first_error; + + for (auto& f : futures) + { + try + { + f.get(); + } + catch (...) + { + if (!first_error) + first_error = std::current_exception(); + } + } + + if (first_error) + std::rethrow_exception(first_error); +} + +/** + * @brief Block until all futures complete, returning an @c expected per slot. + * + * Never throws. Each slot is either the result value or the captured + * @c std::exception_ptr. + * + * @tparam T The value type of each future. + */ +template +auto when_all_settled(std::vector>& futures) + -> std::vector> +{ + std::vector> results; + results.reserve(futures.size()); + + for (auto& f : futures) + { + try + { + results.push_back(f.get()); + } + catch (...) 
+ { + results.push_back(unexpected(std::current_exception())); + } + } + + return results; +} + +/** + * @brief Block until all void futures complete, returning an @c expected per slot. + */ +inline auto when_all_settled(std::vector>& futures) + -> std::vector> +{ + std::vector> results; + results.reserve(futures.size()); + + for (auto& f : futures) + { + try + { + f.get(); + results.emplace_back(); + } + catch (...) + { + results.push_back(unexpected(std::current_exception())); + } + } + + return results; +} + +/** + * @brief Block until the first future becomes ready. + * + * Polls all futures round-robin with a 1 ms timeout until one is ready, + * then returns its index and value. + * + * @note The remaining futures are left in their current state -- the caller + * is responsible for managing their lifetime. + * + * @tparam T The value type of each future. + * @return A pair of (index of the first ready future, its value). + */ +template +auto when_any(std::vector>& futures) -> std::pair +{ + while (true) + { + for (size_t i = 0; i < futures.size(); ++i) + { + if (futures[i].wait_for(std::chrono::milliseconds(1)) == std::future_status::ready) + return {i, futures[i].get()}; + } + } +} + +/** + * @brief Block until the first void future becomes ready. + * + * @return The index of the first ready future. 
+ */ +inline auto when_any(std::vector>& futures) -> size_t +{ + while (true) + { + for (size_t i = 0; i < futures.size(); ++i) + { + if (futures[i].wait_for(std::chrono::milliseconds(1)) == std::future_status::ready) + { + futures[i].get(); + return i; + } + } + } +} + +} // namespace threadschedule diff --git a/include/threadschedule/task.hpp b/include/threadschedule/task.hpp index d0bea60..cbeccbd 100644 --- a/include/threadschedule/task.hpp +++ b/include/threadschedule/task.hpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -26,6 +28,35 @@ namespace threadschedule template class task; +/** + * @brief Type-erased executor interface for pool-aware coroutines. + * + * Implementations schedule a coroutine handle for execution on a specific + * executor (e.g. a thread pool). + */ +struct executor_base +{ + virtual void execute(std::coroutine_handle<>) = 0; + virtual ~executor_base() = default; +}; + +/** + * @brief Executor that dispatches coroutine resumption to a thread pool. + * + * @tparam Pool A thread pool type providing @c submit(Callable). + */ +template +struct pool_executor : executor_base +{ + Pool& pool; + explicit pool_executor(Pool& p) : pool(p) {} + + void execute(std::coroutine_handle<> h) override + { + pool.submit([h]() mutable { h.resume(); }); + } +}; + namespace detail { @@ -35,10 +66,9 @@ namespace detail * @internal This is an implementation detail of the task coroutine machinery. * * When a task's coroutine body finishes, `final_awaiter` is returned from - * `final_suspend()`. It is never ready (always suspends), and on suspension - * it symmetric-transfers to the stored continuation. If no continuation has - * been set (e.g. the task was started via `sync_wait`), it transfers to - * `std::noop_coroutine()` to avoid undefined behaviour. + * `final_suspend()`. If an executor is set on the promise, the continuation + * is dispatched through it (e.g. resumed on a pool thread). 
Otherwise, + * symmetric transfer is used for zero-overhead inline resumption. */ struct final_awaiter { @@ -50,7 +80,13 @@ struct final_awaiter template auto await_suspend(std::coroutine_handle h) const noexcept -> std::coroutine_handle<> { - if (auto cont = h.promise().continuation_; cont) + auto cont = h.promise().continuation_; + if (h.promise().executor_ && cont) + { + h.promise().executor_->execute(cont); + return std::noop_coroutine(); + } + if (cont) return cont; return std::noop_coroutine(); } @@ -78,6 +114,9 @@ struct final_awaiter * - **Continuation:** `continuation_` is set by the task's awaiter just * before resuming the task. `final_awaiter` uses it to return control * to the parent coroutine. + * - **Executor:** If `executor_` is set (e.g. via `schedule_on`), the + * continuation is dispatched through the executor instead of using + * symmetric transfer. */ template class task_promise_base @@ -107,6 +146,7 @@ class task_promise_base } std::coroutine_handle<> continuation_{}; + executor_base* executor_{nullptr}; protected: std::exception_ptr exception_{}; @@ -579,6 +619,71 @@ inline void sync_wait(task t) std::rethrow_exception(ex); } +// --------------------------------------------------------------------------- +// schedule_on awaitable +// --------------------------------------------------------------------------- + +/** + * @brief Awaitable that transfers execution to a thread pool. + * + * Use `co_await schedule_on{pool}` inside any coroutine to continue + * execution on one of the pool's worker threads. + * + * @tparam Pool A thread pool type providing @c submit(Callable). 
+ * + * @par Example + * @code + * task work(HighPerformancePool& pool) { + * co_await schedule_on{pool}; + * // now running on a pool thread + * } + * @endcode + */ +template +struct schedule_on +{ + Pool& pool; + + [[nodiscard]] auto await_ready() const noexcept -> bool { return false; } + + void await_suspend(std::coroutine_handle<> h) const + { + pool.submit([h]() mutable { h.resume(); }); + } + + void await_resume() const noexcept {} +}; + +// --------------------------------------------------------------------------- +// run_on convenience +// --------------------------------------------------------------------------- + +/** + * @brief Submit a coroutine-returning callable to a pool and return a + * @c std::future for its result. + * + * The callable is invoked on a pool worker thread. Inside the callable, + * you can use `co_await` freely -- all continuations run on the calling + * pool unless explicitly transferred elsewhere. + * + * @tparam Pool A thread pool type providing @c submit(Callable). + * @tparam F A callable returning @c task. 
+ * + * @par Example + * @code + * auto future = run_on(pool, []() -> task { co_return 42; }); + * int v = future.get(); + * @endcode + */ +template +auto run_on(Pool& pool, F&& coro_fn) + -> std::future>()))> +{ + return pool.submit([fn = std::forward(coro_fn)]() mutable { + return sync_wait(fn()); + }); +} + } // namespace threadschedule #endif // __cpp_impl_coroutine diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index c26e6f2..d6891eb 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -2,17 +2,24 @@ #include "expected.hpp" #include "scheduler_policy.hpp" +#include "thread_registry.hpp" #include "thread_wrapper.hpp" #include #include +#include #include #include #include #include +#include #include #include #include +#if __cpp_lib_ranges >= 201911L +#include +#endif + namespace threadschedule { @@ -70,6 +77,36 @@ inline auto distribute_workers_across_cpus(WorkerRange& workers) -> expected +inline void parallel_for_each_chunked(Pool& pool, Iterator begin, Iterator end, F&& func, size_t num_workers) +{ + auto const total = static_cast(std::distance(begin, end)); + if (total == 0) + return; + + size_t const chunk_size = (std::max)(size_t(1), total / (num_workers * 4)); + std::vector> futures; + auto it = begin; + + while (it != end) + { + auto remaining = static_cast(std::distance(it, end)); + auto this_chunk = (std::min)(chunk_size, remaining); + auto chunk_end = it; + std::advance(chunk_end, this_chunk); + + futures.push_back(pool.submit([it, chunk_end, &func]() { + for (auto cur = it; cur != chunk_end; ++cur) + func(*cur); + })); + + it = chunk_end; + } + + for (auto& f : futures) + f.get(); +} + } // namespace detail /** @@ -102,6 +139,14 @@ inline auto distribute_workers_across_cpus(WorkerRange& workers) -> expected; + +/// Callback invoked when a pool worker finishes executing a task. 
+using TaskEndCallback = std::function; + template class WorkStealingDeque { @@ -211,6 +256,13 @@ class WorkStealingDeque { return size() == 0; } + + void clear() + { + std::lock_guard lock(mutex_); + bottom_.store(0, std::memory_order_relaxed); + top_.store(0, std::memory_order_relaxed); + } }; /** @@ -293,6 +345,16 @@ class WorkStealingDeque * work-stealing complexity. Best for high-throughput scenarios like * image processing, batch operations, etc. */ + +/** + * @brief Controls how a pool handles pending tasks during shutdown. + */ +enum class ShutdownPolicy : uint8_t +{ + drain, ///< Finish all queued tasks before stopping (default). + drop_pending ///< Finish running tasks, discard queued ones. +}; + class HighPerformancePool { public: @@ -309,14 +371,16 @@ class HighPerformancePool std::chrono::microseconds avg_task_time; }; - explicit HighPerformancePool(size_t num_threads = std::thread::hardware_concurrency()) - : num_threads_(num_threads == 0 ? 1 : num_threads), stop_(false), next_victim_(0), - start_time_(std::chrono::steady_clock::now()) + explicit HighPerformancePool(size_t num_threads = std::thread::hardware_concurrency(), + size_t deque_capacity = WorkStealingDeque::DEFAULT_CAPACITY, + bool register_workers = false) + : num_threads_(num_threads == 0 ? 1 : num_threads), register_workers_(register_workers), + stop_(false), next_victim_(0), start_time_(std::chrono::steady_clock::now()) { worker_queues_.resize(num_threads_); for (size_t i = 0; i < num_threads_; ++i) { - worker_queues_[i] = std::make_unique>(); + worker_queues_[i] = std::make_unique>(deque_capacity); } workers_.reserve(num_threads_); @@ -332,14 +396,73 @@ class HighPerformancePool ~HighPerformancePool() { - shutdown(); + shutdown(ShutdownPolicy::drain); + } + + /** + * @brief Shut the pool down. + * + * @param policy @c drain (default) finishes all queued tasks; + * @c drop_pending discards queued tasks. 
+ */ + void shutdown(ShutdownPolicy policy = ShutdownPolicy::drain) + { + { + std::lock_guard lock(overflow_mutex_); + if (stop_.exchange(true, std::memory_order_acq_rel)) + return; + + if (policy == ShutdownPolicy::drop_pending) + { + std::queue empty; + overflow_tasks_.swap(empty); + for (auto& q : worker_queues_) + q->clear(); + } + } + + wakeup_condition_.notify_all(); + + for (auto& worker : workers_) + { + if (worker.joinable()) + worker.join(); + } + + workers_.clear(); + } + + /** + * @brief Attempt a timed drain: finish as many tasks as possible within + * @p timeout, then force-stop remaining workers. + * @return @c true if all tasks completed within the deadline, + * @c false if the timeout expired first. + */ + auto shutdown_for(std::chrono::milliseconds timeout) -> bool + { + auto const deadline = std::chrono::steady_clock::now() + timeout; + + { + std::lock_guard lock(overflow_mutex_); + if (stop_.load(std::memory_order_acquire)) + return true; + } + + std::unique_lock lock(completion_mutex_); + bool const drained = completion_condition_.wait_until(lock, deadline, [this] { + return pending_tasks() == 0 && active_tasks_.load(std::memory_order_acquire) == 0; + }); + + shutdown(ShutdownPolicy::drain); + return drained; } /** - * @brief High-performance task submission (optimized hot path) + * @brief Submit a task, returning an error instead of throwing on shutdown. */ template - auto submit(F&& f, Args&&... args) -> std::future> + auto try_submit(F&& f, Args&&... 
args) + -> expected>, std::error_code> { using return_type = std::invoke_result_t; @@ -349,9 +472,7 @@ class HighPerformancePool std::future result = task->get_future(); if (stop_.load(std::memory_order_acquire)) - { - throw std::runtime_error("HighPerformancePool is shutting down"); - } + return unexpected(std::make_error_code(std::errc::operation_canceled)); size_t const preferred_queue = next_victim_.fetch_add(1, std::memory_order_relaxed) % num_threads_; @@ -374,9 +495,7 @@ class HighPerformancePool { std::lock_guard lock(overflow_mutex_); if (stop_.load(std::memory_order_relaxed)) - { - throw std::runtime_error("HighPerformancePool is shutting down"); - } + return unexpected(std::make_error_code(std::errc::operation_canceled)); overflow_tasks_.emplace([task]() { (*task)(); }); } @@ -385,19 +504,61 @@ class HighPerformancePool } /** - * @brief Batch task submission for maximum throughput + * @brief Submit a task. Throws std::runtime_error if the pool is shutting down. + */ + template + auto submit(F&& f, Args&&... args) -> std::future> + { + auto result = try_submit(std::forward(f), std::forward(args)...); + if (!result.has_value()) + throw std::runtime_error("HighPerformancePool is shutting down"); + return std::move(result.value()); + } + +#if __cpp_lib_jthread >= 201911L + /** + * @brief Submit a cancellable task. If stop is already requested the task + * is skipped and the future throws @c std::future_error (broken_promise). + */ + template + auto submit(std::stop_token token, F&& f, Args&&... args) + -> std::future> + { + return submit([token = std::move(token), fn = std::bind(std::forward(f), std::forward(args)...)]() mutable { + if (token.stop_requested()) + return decltype(fn())(); + return fn(); + }); + } + + /** + * @brief Non-throwing cancellable submission. + */ + template + auto try_submit(std::stop_token token, F&& f, Args&&... 
args) + -> expected>, std::error_code> + { + return try_submit([token = std::move(token), fn = std::bind(std::forward(f), std::forward(args)...)]() mutable { + if (token.stop_requested()) + return decltype(fn())(); + return fn(); + }); + } +#endif + + /** + * @brief Batch task submission, returning an error instead of throwing on shutdown. */ template - auto submit_batch(Iterator begin, Iterator end) -> std::vector> + auto try_submit_batch(Iterator begin, Iterator end) + -> expected>, std::error_code> { std::vector> futures; size_t const batch_size = std::distance(begin, end); futures.reserve(batch_size); if (stop_.load(std::memory_order_acquire)) - { - throw std::runtime_error("HighPerformancePool is shutting down"); - } + return unexpected(std::make_error_code(std::errc::operation_canceled)); size_t queue_idx = next_victim_.fetch_add(batch_size, std::memory_order_relaxed) % num_threads_; @@ -429,37 +590,39 @@ class HighPerformancePool } /** - * @brief Optimized parallel for_each with work distribution + * @brief Batch task submission. Throws on shutdown. + */ + template + auto submit_batch(Iterator begin, Iterator end) -> std::vector> + { + auto result = try_submit_batch(begin, end); + if (!result.has_value()) + throw std::runtime_error("HighPerformancePool is shutting down"); + return std::move(result.value()); + } + + /** + * @brief Apply a function to a range in parallel using chunked work distribution. 
*/ template void parallel_for_each(Iterator begin, Iterator end, F&& func) { - size_t const total_items = std::distance(begin, end); - if (total_items == 0) - return; - - size_t const chunk_size = (std::max)(size_t(1), total_items / (num_threads_ * 4)); - std::vector> futures; - - for (auto it = begin; it < end;) - { - auto chunk_end = (std::min)(it + chunk_size, end); + detail::parallel_for_each_chunked(*this, begin, end, std::forward(func), num_threads_); + } - futures.push_back(submit([func, it, chunk_end]() { - for (auto chunk_it = it; chunk_it != chunk_end; ++chunk_it) - { - func(*chunk_it); - } - })); +#if __cpp_lib_ranges >= 201911L + template + auto submit_batch(R&& range) { return submit_batch(std::ranges::begin(range), std::ranges::end(range)); } - it = chunk_end; - } + template + auto try_submit_batch(R&& range) { return try_submit_batch(std::ranges::begin(range), std::ranges::end(range)); } - for (auto& future : futures) - { - future.wait(); - } + template + void parallel_for_each(R&& range, F&& func) + { + parallel_for_each(std::ranges::begin(range), std::ranges::end(range), std::forward(func)); } +#endif [[nodiscard]] auto size() const noexcept -> size_t { @@ -505,29 +668,6 @@ class HighPerformancePool lock, [this] { return pending_tasks() == 0 && active_tasks_.load(std::memory_order_acquire) == 0; }); } - void shutdown() - { - { - std::lock_guard lock(overflow_mutex_); - if (stop_.exchange(true, std::memory_order_acq_rel)) - { - return; - } - } - - wakeup_condition_.notify_all(); - - for (auto& worker : workers_) - { - if (worker.joinable()) - { - worker.join(); - } - } - - workers_.clear(); - } - /** * @brief Get detailed performance statistics */ @@ -565,8 +705,27 @@ class HighPerformancePool return stats; } + /** + * @brief Set a callback invoked at the start of each task. 
+ */ + void set_on_task_start(TaskStartCallback cb) + { + std::lock_guard lock(trace_mutex_); + on_task_start_ = std::move(cb); + } + + /** + * @brief Set a callback invoked at the end of each task. + */ + void set_on_task_end(TaskEndCallback cb) + { + std::lock_guard lock(trace_mutex_); + on_task_end_ = std::move(cb); + } + private: size_t num_threads_; + bool register_workers_; std::vector workers_; std::vector>> worker_queues_; @@ -586,11 +745,19 @@ class HighPerformancePool std::atomic stolen_tasks_{0}; std::atomic total_task_time_{0}; + std::mutex trace_mutex_; + TaskStartCallback on_task_start_; + TaskEndCallback on_task_end_; + std::chrono::steady_clock::time_point start_time_; // NOLINTNEXTLINE(readability-function-cognitive-complexity) void worker_function(size_t worker_id) { + std::optional reg_guard; + if (register_workers_) + reg_guard.emplace("hp_worker_" + std::to_string(worker_id), "threadschedule.pool"); + thread_local std::mt19937 gen = []() { std::random_device device; return std::mt19937(device()); @@ -638,6 +805,14 @@ class HighPerformancePool active_tasks_.fetch_add(1, std::memory_order_relaxed); auto const start_time = std::chrono::steady_clock::now(); + auto const tid = std::this_thread::get_id(); + + { + std::lock_guard tl(trace_mutex_); + if (on_task_start_) + on_task_start_(start_time, tid); + } + try { task(); @@ -650,6 +825,12 @@ class HighPerformancePool auto const task_duration = std::chrono::duration_cast(end_time - start_time); total_task_time_.fetch_add(task_duration.count(), std::memory_order_relaxed); + { + std::lock_guard tl(trace_mutex_); + if (on_task_end_) + on_task_end_(end_time, tid, task_duration); + } + active_tasks_.fetch_sub(1, std::memory_order_relaxed); completed_tasks_.fetch_add(1, std::memory_order_relaxed); @@ -690,18 +871,21 @@ struct IndefiniteWait }; /** - * @brief Wait policy that polls with a 10 ms timeout. + * @brief Wait policy that polls with a configurable timeout. 
* * Workers periodically re-check the queue even without notification, trading * a small amount of CPU for lower wake-up latency under bursty workloads. - * Used by the @c FastThreadPool type alias. + * Used by the @c FastThreadPool type alias (default 10 ms). + * + * @tparam IntervalMs Polling interval in milliseconds. */ +template struct PollingWait { template static auto wait(std::condition_variable& cv, Lock& lock, Pred pred) -> bool { - return cv.wait_for(lock, std::chrono::milliseconds(10), pred); + return cv.wait_for(lock, std::chrono::milliseconds(IntervalMs), pred); } }; @@ -777,8 +961,10 @@ class ThreadPoolBase std::chrono::microseconds avg_task_time; }; - explicit ThreadPoolBase(size_t num_threads = std::thread::hardware_concurrency()) - : num_threads_(num_threads == 0 ? 1 : num_threads), stop_(false), + explicit ThreadPoolBase(size_t num_threads = std::thread::hardware_concurrency(), + bool register_workers = false) + : num_threads_(num_threads == 0 ? 1 : num_threads), + register_workers_(register_workers), stop_(false), start_time_(std::chrono::steady_clock::now()) { workers_.reserve(num_threads_); @@ -794,14 +980,15 @@ class ThreadPoolBase ~ThreadPoolBase() { - shutdown(); + shutdown(ShutdownPolicy::drain); } /** - * @brief Submit a task to the thread pool + * @brief Submit a task, returning an error instead of throwing on shutdown. */ template - auto submit(F&& f, Args&&... args) -> std::future> + auto try_submit(F&& f, Args&&... args) + -> expected>, std::error_code> { using return_type = std::invoke_result_t; @@ -813,9 +1000,7 @@ class ThreadPoolBase { std::lock_guard lock(queue_mutex_); if (stop_) - { - throw std::runtime_error("Pool is shutting down"); - } + return unexpected(std::make_error_code(std::errc::operation_canceled)); tasks_.emplace([task]() { (*task)(); }); } @@ -824,10 +1009,51 @@ class ThreadPoolBase } /** - * @brief Submit multiple tasks under a single lock acquisition + * @brief Submit a task. 
Throws std::runtime_error if the pool is shutting down. + */ + template + auto submit(F&& f, Args&&... args) -> std::future> + { + auto result = try_submit(std::forward(f), std::forward(args)...); + if (!result.has_value()) + throw std::runtime_error("Pool is shutting down"); + return std::move(result.value()); + } + +#if __cpp_lib_jthread >= 201911L + /** + * @brief Submit a cancellable task. If stop is already requested the task + * is skipped and returns a default-constructed result. + */ + template + auto submit(std::stop_token token, F&& f, Args&&... args) + -> std::future> + { + return submit([token = std::move(token), fn = std::bind(std::forward(f), std::forward(args)...)]() mutable { + if (token.stop_requested()) + return decltype(fn())(); + return fn(); + }); + } + + template + auto try_submit(std::stop_token token, F&& f, Args&&... args) + -> expected>, std::error_code> + { + return try_submit([token = std::move(token), fn = std::bind(std::forward(f), std::forward(args)...)]() mutable { + if (token.stop_requested()) + return decltype(fn())(); + return fn(); + }); + } +#endif + + /** + * @brief Submit multiple tasks, returning an error instead of throwing on shutdown. */ template - auto submit_batch(Iterator begin, Iterator end) -> std::vector> + auto try_submit_batch(Iterator begin, Iterator end) + -> expected>, std::error_code> { std::vector> futures; futures.reserve(std::distance(begin, end)); @@ -835,9 +1061,7 @@ class ThreadPoolBase { std::lock_guard lock(queue_mutex_); if (stop_) - { - throw std::runtime_error("Pool is shutting down"); - } + return unexpected(std::make_error_code(std::errc::operation_canceled)); for (auto it = begin; it != end; ++it) { @@ -852,24 +1076,39 @@ class ThreadPoolBase } /** - * @brief Apply a function to a range of values in parallel + * @brief Submit multiple tasks under a single lock acquisition. Throws on shutdown. 
+ */ + template + auto submit_batch(Iterator begin, Iterator end) -> std::vector> + { + auto result = try_submit_batch(begin, end); + if (!result.has_value()) + throw std::runtime_error("Pool is shutting down"); + return std::move(result.value()); + } + + /** + * @brief Apply a function to a range in parallel using chunked work distribution. */ template void parallel_for_each(Iterator begin, Iterator end, F&& func) { - std::vector> futures; - futures.reserve(std::distance(begin, end)); + detail::parallel_for_each_chunked(*this, begin, end, std::forward(func), num_threads_); + } - for (auto it = begin; it != end; ++it) - { - futures.push_back(submit([func, it]() { func(*it); })); - } +#if __cpp_lib_ranges >= 201911L + template + auto submit_batch(R&& range) { return submit_batch(std::ranges::begin(range), std::ranges::end(range)); } - for (auto& future : futures) - { - future.wait(); - } + template + auto try_submit_batch(R&& range) { return try_submit_batch(std::ranges::begin(range), std::ranges::end(range)); } + + template + void parallel_for_each(R&& range, F&& func) + { + parallel_for_each(std::ranges::begin(range), std::ranges::end(range), std::forward(func)); } +#endif [[nodiscard]] auto size() const noexcept -> size_t { @@ -914,13 +1153,24 @@ class ThreadPoolBase lock, [this] { return tasks_.empty() && active_tasks_.load(std::memory_order_acquire) == 0; }); } - void shutdown() + /** + * @brief Shut the pool down. + * + * @param policy @c drain (default) finishes all queued tasks; + * @c drop_pending discards queued tasks. 
+ */ + void shutdown(ShutdownPolicy policy = ShutdownPolicy::drain) { { std::lock_guard lock(queue_mutex_); if (stop_) return; stop_ = true; + if (policy == ShutdownPolicy::drop_pending) + { + std::queue empty; + tasks_.swap(empty); + } } condition_.notify_all(); @@ -928,14 +1178,38 @@ class ThreadPoolBase for (auto& worker : workers_) { if (worker.joinable()) - { worker.join(); - } } workers_.clear(); } + /** + * @brief Attempt a timed drain: finish as many tasks as possible within + * @p timeout, then force-stop remaining workers. + * @return @c true if all tasks completed within the deadline, + * @c false if the timeout expired first. + */ + auto shutdown_for(std::chrono::milliseconds timeout) -> bool + { + auto const deadline = std::chrono::steady_clock::now() + timeout; + + { + std::lock_guard lock(queue_mutex_); + if (stop_) + return true; + } + + std::unique_lock lock(queue_mutex_); + bool const drained = task_finished_condition_.wait_until(lock, deadline, [this] { + return tasks_.empty() && active_tasks_.load(std::memory_order_acquire) == 0; + }); + lock.unlock(); + + shutdown(ShutdownPolicy::drain); + return drained; + } + /** * @brief Get performance statistics */ @@ -973,8 +1247,27 @@ class ThreadPoolBase return stats; } + /** + * @brief Set a callback invoked at the start of each task. + */ + void set_on_task_start(TaskStartCallback cb) + { + std::lock_guard lock(trace_mutex_); + on_task_start_ = std::move(cb); + } + + /** + * @brief Set a callback invoked at the end of each task. 
+ */ + void set_on_task_end(TaskEndCallback cb) + { + std::lock_guard lock(trace_mutex_); + on_task_end_ = std::move(cb); + } + private: size_t num_threads_; + bool register_workers_; std::vector workers_; std::queue tasks_; @@ -986,10 +1279,18 @@ class ThreadPoolBase std::atomic completed_tasks_{0}; std::atomic total_task_time_{0}; + std::mutex trace_mutex_; + TaskStartCallback on_task_start_; + TaskEndCallback on_task_end_; + std::chrono::steady_clock::time_point start_time_; - void worker_function(size_t /*worker_id*/) + void worker_function(size_t worker_id) { + std::optional reg_guard; + if (register_workers_) + reg_guard.emplace("pool_worker_" + std::to_string(worker_id), "threadschedule.pool"); + while (true) { Task task; @@ -1022,6 +1323,14 @@ class ThreadPoolBase if (found_task) { auto const start_time = std::chrono::steady_clock::now(); + auto const tid = std::this_thread::get_id(); + + { + std::lock_guard tl(trace_mutex_); + if (on_task_start_) + on_task_start_(start_time, tid); + } + try { task(); @@ -1034,6 +1343,12 @@ class ThreadPoolBase auto const task_duration = std::chrono::duration_cast(end_time - start_time); total_task_time_.fetch_add(task_duration.count(), std::memory_order_relaxed); + { + std::lock_guard tl(trace_mutex_); + if (on_task_end_) + on_task_end_(end_time, tid, task_duration); + } + active_tasks_.fetch_sub(1, std::memory_order_relaxed); completed_tasks_.fetch_add(1, std::memory_order_relaxed); @@ -1062,7 +1377,7 @@ using ThreadPool = ThreadPoolBase; * * @see ThreadPoolBase, PollingWait */ -using FastThreadPool = ThreadPoolBase; +using FastThreadPool = ThreadPoolBase>; // --------------------------------------------------------------------------- // GlobalPool @@ -1099,9 +1414,20 @@ template class GlobalPool { public: + /** + * @brief Pre-configure the number of threads before first use. + * + * Must be called before instance() is first invoked. Subsequent calls + * are ignored (std::call_once semantics). 
+ */ + static void init(size_t num_threads) + { + std::call_once(init_flag_(), [num_threads] { thread_count_() = num_threads; }); + } + static auto instance() -> PoolType& { - static PoolType pool(std::thread::hardware_concurrency()); + static PoolType pool(thread_count_()); return pool; } @@ -1111,20 +1437,58 @@ class GlobalPool return instance().submit(std::forward(f), std::forward(args)...); } + template + static auto try_submit(F&& f, Args&&... args) + { + return instance().try_submit(std::forward(f), std::forward(args)...); + } + template static auto submit_batch(Iterator begin, Iterator end) { return instance().submit_batch(begin, end); } + template + static auto try_submit_batch(Iterator begin, Iterator end) + { + return instance().try_submit_batch(begin, end); + } + template static void parallel_for_each(Iterator begin, Iterator end, F&& func) { instance().parallel_for_each(begin, end, std::forward(func)); } +#if __cpp_lib_ranges >= 201911L + template + static auto submit_batch(R&& range) { return instance().submit_batch(std::forward(range)); } + + template + static auto try_submit_batch(R&& range) { return instance().try_submit_batch(std::forward(range)); } + + template + static void parallel_for_each(R&& range, F&& func) + { + instance().parallel_for_each(std::forward(range), std::forward(func)); + } +#endif + private: GlobalPool() = default; + + static auto init_flag_() -> std::once_flag& + { + static std::once_flag flag; + return flag; + } + + static auto thread_count_() -> size_t& + { + static size_t count = std::thread::hardware_concurrency(); + return count; + } }; /** @brief Singleton @ref ThreadPool accessor. 
*/ diff --git a/include/threadschedule/thread_pool_with_errors.hpp b/include/threadschedule/thread_pool_with_errors.hpp index 5e7e419..1454544 100644 --- a/include/threadschedule/thread_pool_with_errors.hpp +++ b/include/threadschedule/thread_pool_with_errors.hpp @@ -51,11 +51,26 @@ class PoolWithErrors return submit_impl(description, std::forward(f), std::forward(args)...); } + /** + * @brief Submit a task, returning an error instead of throwing on shutdown. + */ + template + auto try_submit(F&& f, Args&&... args) + -> expected>, std::error_code> + { + return try_submit_impl({}, std::forward(f), std::forward(args)...); + } + auto add_error_callback(ErrorCallback callback) -> size_t { return error_handler_->add_callback(std::move(callback)); } + auto remove_error_callback(size_t id) -> bool + { + return error_handler_->remove_callback(id); + } + void clear_error_callbacks() { error_handler_->clear_callbacks(); @@ -139,6 +154,29 @@ class PoolWithErrors return FutureWithErrorHandler>(std::move(future)); } + template + auto try_submit_impl(std::string description, F&& f, Args&&... args) + -> expected>, std::error_code> + { + auto handler = error_handler_; + auto wrapped_task = [f = std::forward(f), args = std::make_tuple(std::forward(args)...), handler, + desc = std::move(description)]() { + try + { + return std::apply(f, args); + } + catch (...) 
+ { + handler->handle_error(TaskError::capture(desc)); + throw; + } + }; + auto result = pool_.try_submit(std::move(wrapped_task)); + if (!result.has_value()) + return unexpected(result.error()); + return FutureWithErrorHandler>(std::move(result.value())); + } + PoolType pool_; std::shared_ptr error_handler_; }; diff --git a/include/threadschedule/threadschedule.hpp b/include/threadschedule/threadschedule.hpp index 06b293d..4896b12 100644 --- a/include/threadschedule/threadschedule.hpp +++ b/include/threadschedule/threadschedule.hpp @@ -3,6 +3,7 @@ #include "chaos.hpp" #include "concepts.hpp" #include "error_handler.hpp" +#include "futures.hpp" #include "generator.hpp" #include "profiles.hpp" #include "pthread_wrapper.hpp" @@ -69,13 +70,17 @@ using ts::ScheduledTaskHandle; using ts::ScheduledThreadPool; using ts::ScheduledThreadPoolT; using ts::SchedulingPolicy; +using ts::ShutdownPolicy; using ts::TaskError; using ts::ThreadAffinity; using ts::ThreadByNameView; using ts::ThreadPool; using ts::ThreadPoolBase; using ts::ThreadPoolWithErrors; +using ts::PollingWait; using ts::PoolWithErrors; +using ts::TaskEndCallback; +using ts::TaskStartCallback; using ts::ThreadPriority; using ts::ThreadProfile; using ts::ThreadWrapper; @@ -86,11 +91,20 @@ using ts::BuildMode; using ts::build_mode; using ts::build_mode_string; +// Future combinators +using ts::when_all; +using ts::when_all_settled; +using ts::when_any; + // Coroutine primitives (C++20) #if defined(__cpp_impl_coroutine) && __cpp_impl_coroutine >= 201902L -using ts::task; -using ts::sync_wait; +using ts::executor_base; using ts::generator; +using ts::pool_executor; +using ts::run_on; +using ts::schedule_on; +using ts::sync_wait; +using ts::task; #endif } // namespace threadschedule From bc6798c53820fc7f31051550f18c727eefd01b7c Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 17:42:45 +0200 Subject: [PATCH 05/15] Enhance thread pool capabilities and introduce new lightweight pool - Added 
`LightweightPoolT` for ultra-lightweight fire-and-forget task execution with zero heap allocations for typical lambdas. - Implemented `post()` and `try_post()` methods for fire-and-forget submissions across all pool types, reducing overhead. - Updated `ScheduledThreadPoolT` to utilize `post()` internally, eliminating unnecessary future allocations. - Introduced new types: `LightweightPool`, `ScheduledLightweightPool`, and `detail::SboCallable` for improved callable management. - Updated CHANGELOG to reflect these new features and enhancements. --- CHANGELOG.md | 18 + include/threadschedule/pthread_wrapper.hpp | 12 +- include/threadschedule/scheduled_pool.hpp | 4 +- include/threadschedule/thread_pool.hpp | 485 ++++++++++++++++++++- include/threadschedule/threadschedule.hpp | 3 + 5 files changed, 504 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6ca2fd..e3f3de2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -86,6 +86,21 @@ executor for pool-aware tasks, `run_on(pool, coro_fn)` convenience returning `std::future`. +- **`LightweightPoolT`** -- ultra-lightweight fire-and-forget pool + using a custom `detail::SboCallable` with configurable inline buffer + (default 64 bytes = 1 cache line, 56 bytes usable). Zero heap allocations for + typical lambdas. No futures, no `packaged_task`, no statistics, no tracing. + Workers are `ThreadWrapper` so `configure_threads`/`set_affinity` still work. + `using LightweightPool = LightweightPoolT<>` for the default. + +- **`post()` / `try_post()`** -- fire-and-forget submission on all pool types + (`HighPerformancePool`, `ThreadPoolBase`, `GlobalPool`). Same queue logic as + `submit()` but skips `packaged_task`/`shared_ptr`/`future` overhead. + +- **`ScheduledThreadPoolT` now uses `post()`** internally instead of `submit()`, + eliminating wasted `future` allocations for every scheduled task dispatch. + New alias: `ScheduledLightweightPool = ScheduledThreadPoolT`. 
+ ### New Types - `ThreadPoolBase` - parameterized single-queue thread pool. @@ -97,6 +112,9 @@ - `executor_base` / `pool_executor` - type-erased executor for coroutines. - `schedule_on` - awaitable for hopping to a pool thread. - `futures.hpp` - future combinators (`when_all`, `when_any`, `when_all_settled`). +- `LightweightPoolT` / `LightweightPool` - fire-and-forget pool with SBO. +- `detail::SboCallable` - type-erased callable with inline storage. +- `ScheduledLightweightPool` - scheduled pool backed by `LightweightPool`. ### Internal Improvements diff --git a/include/threadschedule/pthread_wrapper.hpp b/include/threadschedule/pthread_wrapper.hpp index db6485a..db7ee4b 100644 --- a/include/threadschedule/pthread_wrapper.hpp +++ b/include/threadschedule/pthread_wrapper.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -59,9 +60,11 @@ class PThreadWrapper explicit PThreadWrapper(F&& func, Args&&... args) : thread_(0), joined_(false) { - // Store the callable in a way pthread can handle auto callable = - std::make_unique>(std::bind(std::forward(func), std::forward(args)...)); + std::make_unique>([fn = std::forward(func), + tup = std::make_tuple(std::forward(args)...)]() mutable { + std::apply(std::move(fn), std::move(tup)); + }); int const result = pthread_create(&thread_, nullptr, thread_function, callable.release()); @@ -221,7 +224,10 @@ class PThreadWrapper PThreadWrapper wrapper; auto callable = - std::make_unique>(std::bind(std::forward(func), std::forward(args)...)); + std::make_unique>([fn = std::forward(func), + tup = std::make_tuple(std::forward(args)...)]() mutable { + std::apply(std::move(fn), std::move(tup)); + }); int const result = pthread_create(&wrapper.thread_, &attr, thread_function, callable.release()); diff --git a/include/threadschedule/scheduled_pool.hpp b/include/threadschedule/scheduled_pool.hpp index 31c149f..748ac8e 100644 --- a/include/threadschedule/scheduled_pool.hpp +++ 
b/include/threadschedule/scheduled_pool.hpp @@ -359,7 +359,7 @@ class ScheduledThreadPoolT auto task_copy = info.task; auto cancelled_flag = info.cancelled; - pool_.submit([task_copy, cancelled_flag]() { + pool_.post([task_copy, cancelled_flag]() { if (!cancelled_flag->load(std::memory_order_acquire)) { task_copy(); @@ -387,5 +387,7 @@ using ScheduledThreadPool = ScheduledThreadPoolT; using ScheduledHighPerformancePool = ScheduledThreadPoolT; /** @brief @ref ScheduledThreadPoolT using @ref FastThreadPool as backend. */ using ScheduledFastThreadPool = ScheduledThreadPoolT; +/** @brief @ref ScheduledThreadPoolT using @ref LightweightPool as backend (minimal overhead). */ +using ScheduledLightweightPool = ScheduledThreadPoolT; } // namespace threadschedule diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index d6891eb..b1caefd 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #if __cpp_lib_ranges >= 201911L @@ -107,6 +108,157 @@ inline void parallel_for_each_chunked(Pool& pool, Iterator begin, Iterator end, f.get(); } +// --------------------------------------------------------------------------- +// bind_args -- optimal argument binding, C++20 pack-capture or C++17 tuple +// --------------------------------------------------------------------------- + +/** + * @brief Bind a callable with its arguments into a nullary lambda. + * + * On C++20 and later this uses pack init-captures for zero intermediate + * storage overhead. On C++17 it falls back to @c std::make_tuple / + * @c std::apply which is still significantly faster than @c std::bind. + */ +template +auto bind_args(F&& f, Args&&... 
args) +{ +#if __cpp_init_captures >= 201803L + return [fn = std::forward(f), ...a = std::forward(args)]() mutable { + return fn(std::move(a)...); + }; +#else + return [fn = std::forward(f), + tup = std::make_tuple(std::forward(args)...)]() mutable { + return std::apply(std::move(fn), std::move(tup)); + }; +#endif +} + +// --------------------------------------------------------------------------- +// SboCallable -- type-erased callable with inline small-buffer storage +// --------------------------------------------------------------------------- + +/** + * @brief Type-erased, move-only callable with configurable inline storage. + * + * Avoids the heap allocation that @c std::function incurs for callables + * larger than its (typically 16-byte) internal buffer. Callables that fit + * within @c TaskSize - sizeof(void*) bytes are stored inline; larger ones + * fall back to a heap allocation. + * + * @tparam TaskSize Total object size in bytes (default 64, one x86 cache line). + * The usable inline buffer is @c TaskSize - 8 bytes on 64-bit platforms. 
+ */ +template +class SboCallable +{ + static_assert(TaskSize > sizeof(void*), "TaskSize must be larger than a pointer"); + + struct VTable + { + void (*invoke)(void* storage); + void (*destroy)(void* storage); + void (*move_to)(void* dst, void* src) noexcept; + }; + + static constexpr size_t kBufferSize = TaskSize - sizeof(VTable const*); + + template + static constexpr bool fits_inline_v = + sizeof(F) <= kBufferSize && + alignof(F) <= alignof(std::max_align_t) && + std::is_nothrow_move_constructible_v; + + template + static VTable const* vtable_for() noexcept + { + if constexpr (fits_inline_v) + { + static constexpr VTable vt{ + [](void* s) { (*static_cast(s))(); }, + [](void* s) { static_cast(s)->~F(); }, + [](void* dst, void* src) noexcept { + ::new (dst) F(std::move(*static_cast(src))); + static_cast(src)->~F(); + }}; + return &vt; + } + else + { + static constexpr VTable vt{ + [](void* s) { (*(*static_cast(s)))(); }, + [](void* s) { delete *static_cast(s); }, + [](void* dst, void* src) noexcept { + *static_cast(dst) = *static_cast(src); + *static_cast(src) = nullptr; + }}; + return &vt; + } + } + + public: + SboCallable() = default; + + template , SboCallable>>> + SboCallable(F&& f) // NOLINT(google-explicit-constructor) + { + using Decay = std::decay_t; + vtable_ = vtable_for(); + if constexpr (fits_inline_v) + ::new (buffer_) Decay(std::forward(f)); + else + *reinterpret_cast(buffer_) = new Decay(std::forward(f)); + } + + SboCallable(SboCallable&& other) noexcept : vtable_(other.vtable_) + { + if (vtable_) + { + vtable_->move_to(buffer_, other.buffer_); + other.vtable_ = nullptr; + } + } + + auto operator=(SboCallable&& other) noexcept -> SboCallable& + { + if (this != &other) + { + if (vtable_) + vtable_->destroy(buffer_); + vtable_ = other.vtable_; + if (vtable_) + { + vtable_->move_to(buffer_, other.buffer_); + other.vtable_ = nullptr; + } + } + return *this; + } + + SboCallable(SboCallable const&) = delete; + auto operator=(SboCallable const&) -> 
SboCallable& = delete; + + ~SboCallable() + { + if (vtable_) + vtable_->destroy(buffer_); + } + + explicit operator bool() const noexcept { return vtable_ != nullptr; } + + void operator()() + { + auto* vt = vtable_; + vtable_ = nullptr; + vt->invoke(buffer_); + vt->destroy(buffer_); + } + + private: + VTable const* vtable_ = nullptr; + alignas(std::max_align_t) unsigned char buffer_[kBufferSize]{}; +}; + } // namespace detail /** @@ -467,7 +619,7 @@ class HighPerformancePool using return_type = std::invoke_result_t; auto task = std::make_shared>( - std::bind(std::forward(f), std::forward(args)...)); + detail::bind_args(std::forward(f), std::forward(args)...)); std::future result = task->get_future(); @@ -515,6 +667,57 @@ class HighPerformancePool return std::move(result.value()); } + /** + * @brief Fire-and-forget submission (no future, no packaged_task overhead). + */ + template + void post(F&& f, Args&&... args) + { + auto r = try_post(std::forward(f), std::forward(args)...); + if (!r.has_value()) + throw std::runtime_error("HighPerformancePool is shutting down"); + } + + /** + * @brief Fire-and-forget submission. Returns error on shutdown. + */ + template + auto try_post(F&& f, Args&&... 
args) -> expected + { + Task bound(detail::bind_args(std::forward(f), std::forward(args)...)); + + if (stop_.load(std::memory_order_acquire)) + return unexpected(std::make_error_code(std::errc::operation_canceled)); + + size_t const preferred_queue = next_victim_.fetch_add(1, std::memory_order_relaxed) % num_threads_; + + if (worker_queues_[preferred_queue]->push(std::move(bound))) + { + wakeup_condition_.notify_one(); + return {}; + } + + for (size_t attempts = 0; attempts < (std::min)(num_threads_, size_t(3)); ++attempts) + { + size_t const idx = (preferred_queue + attempts + 1) % num_threads_; + if (worker_queues_[idx]->push(std::move(bound))) + { + wakeup_condition_.notify_one(); + return {}; + } + } + + { + std::lock_guard lock(overflow_mutex_); + if (stop_.load(std::memory_order_relaxed)) + return unexpected(std::make_error_code(std::errc::operation_canceled)); + overflow_tasks_.emplace(std::move(bound)); + } + + wakeup_condition_.notify_all(); + return {}; + } + #if __cpp_lib_jthread >= 201911L /** * @brief Submit a cancellable task. If stop is already requested the task @@ -524,10 +727,11 @@ class HighPerformancePool auto submit(std::stop_token token, F&& f, Args&&... args) -> std::future> { - return submit([token = std::move(token), fn = std::bind(std::forward(f), std::forward(args)...)]() mutable { + return submit([token = std::move(token), + bound = detail::bind_args(std::forward(f), std::forward(args)...)]() mutable { if (token.stop_requested()) - return decltype(fn())(); - return fn(); + return std::invoke_result_t(); + return bound(); }); } @@ -538,10 +742,11 @@ class HighPerformancePool auto try_submit(std::stop_token token, F&& f, Args&&... 
args) -> expected>, std::error_code> { - return try_submit([token = std::move(token), fn = std::bind(std::forward(f), std::forward(args)...)]() mutable { + return try_submit([token = std::move(token), + bound = detail::bind_args(std::forward(f), std::forward(args)...)]() mutable { if (token.stop_requested()) - return decltype(fn())(); - return fn(); + return std::invoke_result_t(); + return bound(); }); } #endif @@ -993,7 +1198,7 @@ class ThreadPoolBase using return_type = std::invoke_result_t; auto task = std::make_shared>( - std::bind(std::forward(f), std::forward(args)...)); + detail::bind_args(std::forward(f), std::forward(args)...)); std::future result = task->get_future(); @@ -1020,6 +1225,33 @@ class ThreadPoolBase return std::move(result.value()); } + /** + * @brief Fire-and-forget submission (no future, no packaged_task overhead). + */ + template + void post(F&& f, Args&&... args) + { + auto r = try_post(std::forward(f), std::forward(args)...); + if (!r.has_value()) + throw std::runtime_error("Pool is shutting down"); + } + + /** + * @brief Fire-and-forget submission. Returns error on shutdown. + */ + template + auto try_post(F&& f, Args&&... args) -> expected + { + { + std::lock_guard lock(queue_mutex_); + if (stop_) + return unexpected(std::make_error_code(std::errc::operation_canceled)); + tasks_.emplace(detail::bind_args(std::forward(f), std::forward(args)...)); + } + condition_.notify_one(); + return {}; + } + #if __cpp_lib_jthread >= 201911L /** * @brief Submit a cancellable task. If stop is already requested the task @@ -1029,10 +1261,11 @@ class ThreadPoolBase auto submit(std::stop_token token, F&& f, Args&&... 
args) -> std::future> { - return submit([token = std::move(token), fn = std::bind(std::forward(f), std::forward(args)...)]() mutable { + return submit([token = std::move(token), + bound = detail::bind_args(std::forward(f), std::forward(args)...)]() mutable { if (token.stop_requested()) - return decltype(fn())(); - return fn(); + return std::invoke_result_t(); + return bound(); }); } @@ -1040,10 +1273,11 @@ class ThreadPoolBase auto try_submit(std::stop_token token, F&& f, Args&&... args) -> expected>, std::error_code> { - return try_submit([token = std::move(token), fn = std::bind(std::forward(f), std::forward(args)...)]() mutable { + return try_submit([token = std::move(token), + bound = detail::bind_args(std::forward(f), std::forward(args)...)]() mutable { if (token.stop_requested()) - return decltype(fn())(); - return fn(); + return std::invoke_result_t(); + return bound(); }); } #endif @@ -1379,6 +1613,217 @@ using ThreadPool = ThreadPoolBase; */ using FastThreadPool = ThreadPoolBase>; +// --------------------------------------------------------------------------- +// LightweightPoolT +// --------------------------------------------------------------------------- + +/** + * @brief Ultra-lightweight fire-and-forget thread pool. + * + * Uses a custom @ref detail::SboCallable instead of @c std::function to avoid + * heap allocations for callables up to @c TaskSize - 8 bytes. No futures, no + * packaged_task, no statistics, no tracing -- just raw throughput. + * + * Workers are @ref ThreadWrapper instances so that naming, affinity, and + * scheduling policy can still be configured after construction. + * + * @par API + * Only @c post() (fire-and-forget) is provided. For tasks that need a return + * value, use @ref ThreadPool or @ref HighPerformancePool with @c submit(). + * + * @tparam TaskSize Total size in bytes of each inline task slot (default 64, + * one x86 cache line). Usable buffer = @c TaskSize - 8 bytes. 
+ */ +template +class LightweightPoolT +{ + public: + explicit LightweightPoolT(size_t num_threads = std::thread::hardware_concurrency()) + : num_threads_(num_threads == 0 ? 1 : num_threads) + { + workers_.reserve(num_threads_); + for (size_t i = 0; i < num_threads_; ++i) + workers_.emplace_back(&LightweightPoolT::worker_loop, this); + } + + LightweightPoolT(LightweightPoolT const&) = delete; + auto operator=(LightweightPoolT const&) -> LightweightPoolT& = delete; + + ~LightweightPoolT() { shutdown(ShutdownPolicy::drain); } + + /** + * @brief Fire-and-forget task submission. Throws on shutdown. + */ + template + void post(F&& f, Args&&... args) + { + auto r = try_post(std::forward(f), std::forward(args)...); + if (!r.has_value()) + throw std::runtime_error("LightweightPool is shutting down"); + } + + /** + * @brief Fire-and-forget task submission. Returns error on shutdown. + */ + template + auto try_post(F&& f, Args&&... args) -> expected + { + detail::SboCallable task(detail::bind_args(std::forward(f), std::forward(args)...)); + { + std::lock_guard lock(mutex_); + if (stop_) + return unexpected(std::make_error_code(std::errc::operation_canceled)); + tasks_.push(std::move(task)); + } + condition_.notify_one(); + return {}; + } + + /** + * @brief Batch fire-and-forget submission under a single lock. + */ + template + void post_batch(Iterator begin, Iterator end) + { + auto r = try_post_batch(begin, end); + if (!r.has_value()) + throw std::runtime_error("LightweightPool is shutting down"); + } + + /** + * @brief Batch fire-and-forget submission. Returns error on shutdown. 
+ */ + template + auto try_post_batch(Iterator begin, Iterator end) -> expected + { + { + std::lock_guard lock(mutex_); + if (stop_) + return unexpected(std::make_error_code(std::errc::operation_canceled)); + for (auto it = begin; it != end; ++it) + tasks_.push(detail::SboCallable(*it)); + } + condition_.notify_all(); + return {}; + } + +#if __cpp_lib_ranges >= 201911L + template + void post_batch(R&& range) { post_batch(std::ranges::begin(range), std::ranges::end(range)); } + + template + auto try_post_batch(R&& range) { return try_post_batch(std::ranges::begin(range), std::ranges::end(range)); } +#endif + + /** + * @brief Shut the pool down. + */ + void shutdown(ShutdownPolicy policy = ShutdownPolicy::drain) + { + { + std::lock_guard lock(mutex_); + if (stop_) + return; + stop_ = true; + if (policy == ShutdownPolicy::drop_pending) + { + std::queue> empty; + tasks_.swap(empty); + } + } + condition_.notify_all(); + for (auto& w : workers_) + { + if (w.joinable()) + w.join(); + } + workers_.clear(); + } + + /** + * @brief Timed drain: finish as many tasks as possible within timeout. + * @return @c true if all tasks completed, @c false on timeout. 
+ */ + auto shutdown_for(std::chrono::milliseconds timeout) -> bool + { + auto const deadline = std::chrono::steady_clock::now() + timeout; + { + std::lock_guard lock(mutex_); + if (stop_) + return true; + } + std::unique_lock lock(mutex_); + bool const drained = drain_condition_.wait_until(lock, deadline, [this] { + return tasks_.empty() && active_tasks_.load(std::memory_order_acquire) == 0; + }); + lock.unlock(); + shutdown(ShutdownPolicy::drain); + return drained; + } + + [[nodiscard]] auto size() const noexcept -> size_t { return num_threads_; } + + auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, + ThreadPriority priority = ThreadPriority::normal()) -> expected + { + return detail::configure_worker_threads(workers_, name_prefix, policy, priority); + } + + auto set_affinity(ThreadAffinity const& affinity) -> expected + { + return detail::set_worker_affinity(workers_, affinity); + } + + auto distribute_across_cpus() -> expected + { + return detail::distribute_workers_across_cpus(workers_); + } + + private: + size_t num_threads_; + std::vector workers_; + std::queue> tasks_; + std::mutex mutex_; + std::condition_variable condition_; + std::condition_variable drain_condition_; + std::atomic stop_{false}; + std::atomic active_tasks_{0}; + + void worker_loop() + { + while (true) + { + detail::SboCallable task; + { + std::unique_lock lock(mutex_); + condition_.wait(lock, [this] { return stop_ || !tasks_.empty(); }); + if (stop_ && tasks_.empty()) + return; + if (!tasks_.empty()) + { + task = std::move(tasks_.front()); + tasks_.pop(); + active_tasks_.fetch_add(1, std::memory_order_relaxed); + } + else + continue; + } + try + { + task(); + } + catch (...) + { + } + active_tasks_.fetch_sub(1, std::memory_order_relaxed); + drain_condition_.notify_all(); + } + } +}; + +/** @brief Default lightweight pool with 64-byte task slots. 
*/ +using LightweightPool = LightweightPoolT<>; + // --------------------------------------------------------------------------- // GlobalPool // --------------------------------------------------------------------------- @@ -1443,6 +1888,18 @@ class GlobalPool return instance().try_submit(std::forward(f), std::forward(args)...); } + template + static void post(F&& f, Args&&... args) + { + instance().post(std::forward(f), std::forward(args)...); + } + + template + static auto try_post(F&& f, Args&&... args) + { + return instance().try_post(std::forward(f), std::forward(args)...); + } + template static auto submit_batch(Iterator begin, Iterator end) { diff --git a/include/threadschedule/threadschedule.hpp b/include/threadschedule/threadschedule.hpp index 4896b12..48c509e 100644 --- a/include/threadschedule/threadschedule.hpp +++ b/include/threadschedule/threadschedule.hpp @@ -62,10 +62,13 @@ using ts::GlobalThreadPool; using ts::HighPerformancePool; using ts::HighPerformancePoolWithErrors; using ts::JThreadWrapper; +using ts::LightweightPool; +using ts::LightweightPoolT; using ts::JThreadWrapperView; using ts::read_topology; using ts::ScheduledFastThreadPool; using ts::ScheduledHighPerformancePool; +using ts::ScheduledLightweightPool; using ts::ScheduledTaskHandle; using ts::ScheduledThreadPool; using ts::ScheduledThreadPoolT; From 56e79a32e621d8f85da3db8aa945d1510cccb314 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 17:57:30 +0200 Subject: [PATCH 06/15] Fix buffer initialization in SboCallable to ensure proper alignment without default initialization. Added missing include for to support size-related definitions. 
--- include/threadschedule/thread_pool.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index b1caefd..1b9d443 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -6,6 +6,7 @@ #include "thread_wrapper.hpp" #include #include +#include #include #include #include @@ -256,7 +257,7 @@ class SboCallable private: VTable const* vtable_ = nullptr; - alignas(std::max_align_t) unsigned char buffer_[kBufferSize]{}; + alignas(std::max_align_t) unsigned char buffer_[kBufferSize]; }; } // namespace detail From 2ab3492a0d3f0fa865d4f971face7971428089aa Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 20:27:48 +0200 Subject: [PATCH 07/15] Enhance benchmark tests and documentation for LightweightPool - Added new benchmarks for `LightweightPool` to evaluate performance in fire-and-forget scenarios, including minimal tasks, light tasks, and batch posting. - Updated `run_benchmarks.sh` to include guidance on using `LightweightPool` for low-overhead task execution. - Modified `CMakeLists.txt` to conditionally link the performance benchmark with Google Benchmark. - Refactored `performance_benchmark.cpp` to streamline task submission and improve clarity in benchmark results. - Updated CHANGELOG to reflect the addition of `LightweightPool` benchmarks and related enhancements. 
--- benchmarks/threadpool_benchmarks.cpp | 264 +++++++++++++++++++---- examples/CMakeLists.txt | 8 +- examples/performance_benchmark.cpp | 285 +++++++++---------------- include/threadschedule/thread_pool.hpp | 159 +++++++------- run_benchmarks.sh | 1 + 5 files changed, 420 insertions(+), 297 deletions(-) diff --git a/benchmarks/threadpool_benchmarks.cpp b/benchmarks/threadpool_benchmarks.cpp index 488e371..71ceb53 100644 --- a/benchmarks/threadpool_benchmarks.cpp +++ b/benchmarks/threadpool_benchmarks.cpp @@ -284,82 +284,220 @@ static void BM_HighPerformancePool_ParallelForEach(benchmark::State& state) state.SetLabel("threads=" + std::to_string(num_threads) + " items=" + std::to_string(data_size)); } +// ============================================================================= +// LightweightPool Benchmarks (fire-and-forget via post) +// ============================================================================= + +static void BM_LightweightPool_MinimalTasks(benchmark::State& state) +{ + size_t const num_threads = state.range(0); + size_t const num_tasks = state.range(1); + + LightweightPool pool(num_threads); + pool.configure_threads("bench"); + + for (auto _ : state) + { + std::atomic counter{0}; + + auto start = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < num_tasks; ++i) + { + pool.post([&counter]() { counter.fetch_add(1, std::memory_order_relaxed); }); + } + + while (counter.load(std::memory_order_acquire) < num_tasks) + std::this_thread::yield(); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = std::chrono::duration_cast(end - start); + state.SetIterationTime(elapsed.count() / 1e9); + } + + state.SetItemsProcessed(state.iterations() * num_tasks); + state.SetLabel("threads=" + std::to_string(num_threads) + " tasks=" + std::to_string(num_tasks)); +} + +static void BM_LightweightPool_LightTasks(benchmark::State& state) +{ + size_t const num_threads = state.range(0); + size_t const num_tasks = 
state.range(1); + + LightweightPool pool(num_threads); + pool.configure_threads("bench"); + + for (auto _ : state) + { + std::atomic counter{0}; + + for (size_t i = 0; i < num_tasks; ++i) + { + pool.post([&counter]() { + BenchmarkWorkloads::light_cpu_task(); + counter.fetch_add(1, std::memory_order_relaxed); + }); + } + + while (counter.load(std::memory_order_acquire) < num_tasks) + std::this_thread::yield(); + } + + state.SetItemsProcessed(state.iterations() * num_tasks); + state.SetLabel("threads=" + std::to_string(num_threads)); +} + +static void BM_LightweightPool_BatchPost(benchmark::State& state) +{ + size_t const num_threads = state.range(0); + size_t const batch_size = state.range(1); + + LightweightPool pool(num_threads); + pool.configure_threads("bench"); + + std::vector> tasks; + tasks.reserve(batch_size); + std::atomic counter{0}; + for (size_t i = 0; i < batch_size; ++i) + { + tasks.emplace_back([&counter]() { + BenchmarkWorkloads::light_cpu_task(); + counter.fetch_add(1, std::memory_order_relaxed); + }); + } + + for (auto _ : state) + { + counter = 0; + pool.post_batch(tasks.begin(), tasks.end()); + + while (counter.load(std::memory_order_acquire) < batch_size) + std::this_thread::yield(); + } + + state.SetItemsProcessed(state.iterations() * batch_size); + state.SetLabel("threads=" + std::to_string(num_threads) + " batch=" + std::to_string(batch_size)); +} + // ============================================================================= // Comparison Benchmarks (All Pools) // ============================================================================= // Note: This benchmark shows workload-dependent behavior: -// - For small task counts (< 100), simpler pools (ThreadPool/FastThreadPool) perform better +// - LightweightPool excels for fire-and-forget (no future overhead) +// - For small task counts (< 100), simpler pools perform better // - For larger task counts (1k+), HighPerformancePool shows its advantage due to work-stealing -// - Real-world 
workloads typically benefit from HighPerformancePool (e.g., image processing) static void BM_ComparePoolTypes_LightWorkload(benchmark::State& state) { - size_t const num_threads = 4; // Fixed for fair comparison + size_t const num_threads = 4; size_t const num_tasks = state.range(0); - int const pool_type = state.range(1); // 0=ThreadPool, 1=FastThreadPool, 2=HighPerformancePool + int const pool_type = state.range(1); for (auto _ : state) { - state.PauseTiming(); - - std::vector> futures; - futures.reserve(num_tasks); - - state.ResumeTiming(); - if (pool_type == 0) { ThreadPool pool(num_threads); pool.configure_threads("bench"); + std::vector> futures; + futures.reserve(num_tasks); for (size_t i = 0; i < num_tasks; ++i) - { futures.push_back(pool.submit(BenchmarkWorkloads::light_cpu_task)); - } - - for (auto& future : futures) - { - future.wait(); - } + for (auto& f : futures) + f.wait(); } else if (pool_type == 1) { FastThreadPool pool(num_threads); pool.configure_threads("bench"); + std::vector> futures; + futures.reserve(num_tasks); for (size_t i = 0; i < num_tasks; ++i) - { futures.push_back(pool.submit(BenchmarkWorkloads::light_cpu_task)); - } - - for (auto& future : futures) - { - future.wait(); - } + for (auto& f : futures) + f.wait(); } else if (pool_type == 2) { HighPerformancePool pool(num_threads); pool.configure_threads("bench"); pool.distribute_across_cpus(); + std::vector> futures; + futures.reserve(num_tasks); for (size_t i = 0; i < num_tasks; ++i) - { futures.push_back(pool.submit(BenchmarkWorkloads::light_cpu_task)); - } + for (auto& f : futures) + f.wait(); + } + else if (pool_type == 3) + { + LightweightPool pool(num_threads); + pool.configure_threads("bench"); + std::atomic counter{0}; - for (auto& future : futures) + for (size_t i = 0; i < num_tasks; ++i) { - future.wait(); + pool.post([&counter]() { + BenchmarkWorkloads::light_cpu_task(); + counter.fetch_add(1, std::memory_order_relaxed); + }); } + + while 
(counter.load(std::memory_order_acquire) < num_tasks) + std::this_thread::yield(); } } - std::vector pool_names = {"ThreadPool", "FastThreadPool", "HighPerformancePool"}; + std::vector pool_names = {"ThreadPool", "FastThreadPool", "HighPerformancePool", "LightweightPool"}; state.SetItemsProcessed(state.iterations() * num_tasks); state.SetLabel(pool_names[pool_type] + " tasks=" + std::to_string(num_tasks)); } +// ============================================================================= +// Post vs Submit comparison (fire-and-forget overhead on pools that support both) +// ============================================================================= + +static void BM_PostVsSubmit(benchmark::State& state) +{ + size_t const num_tasks = state.range(0); + int const mode = state.range(1); + + HighPerformancePool pool(4); + pool.configure_threads("bench"); + + for (auto _ : state) + { + if (mode == 0) + { + std::vector> futures; + futures.reserve(num_tasks); + for (size_t i = 0; i < num_tasks; ++i) + futures.push_back(pool.submit(BenchmarkWorkloads::minimal_task)); + for (auto& f : futures) + f.wait(); + } + else + { + std::atomic counter{0}; + for (size_t i = 0; i < num_tasks; ++i) + { + pool.post([&counter]() { + BenchmarkWorkloads::minimal_task(); + counter.fetch_add(1, std::memory_order_relaxed); + }); + } + while (counter.load(std::memory_order_acquire) < num_tasks) + std::this_thread::yield(); + } + } + + state.SetItemsProcessed(state.iterations() * num_tasks); + state.SetLabel(mode == 0 ? 
"submit(future)" : "post(fire-forget)"); +} + // ============================================================================= // Registration with various parameter combinations // ============================================================================= @@ -462,23 +600,77 @@ BENCHMARK(BM_HighPerformancePool_ParallelForEach) ->Args({16, 1000000}) ->Unit(benchmark::kMillisecond); -// Pool comparison benchmarks - showing workload-dependent behavior +// LightweightPool benchmarks +BENCHMARK(BM_LightweightPool_MinimalTasks) + ->Args({1, 100}) + ->Args({2, 100}) + ->Args({4, 100}) + ->Args({8, 100}) + ->Args({1, 1000}) + ->Args({2, 1000}) + ->Args({4, 1000}) + ->Args({8, 1000}) + ->Args({1, 10000}) + ->Args({4, 10000}) + ->Args({8, 10000}) + ->Args({4, 100000}) + ->Args({8, 100000}) + ->UseManualTime() + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_LightweightPool_LightTasks) + ->Args({1, 100}) + ->Args({2, 100}) + ->Args({4, 100}) + ->Args({8, 100}) + ->Args({1, 1000}) + ->Args({4, 1000}) + ->Args({8, 1000}) + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_LightweightPool_BatchPost) + ->Args({1, 1000}) + ->Args({2, 1000}) + ->Args({4, 1000}) + ->Args({8, 1000}) + ->Args({4, 5000}) + ->Args({8, 5000}) + ->Args({4, 10000}) + ->Args({8, 10000}) + ->Unit(benchmark::kMillisecond); + +// Pool comparison benchmarks (all 4 pool types) BENCHMARK(BM_ComparePoolTypes_LightWorkload) ->Args({10, 0}) ->Args({10, 1}) - ->Args({10, 2}) // Very small tasks (ThreadPool/FastThreadPool advantage) + ->Args({10, 2}) + ->Args({10, 3}) ->Args({100, 0}) ->Args({100, 1}) - ->Args({100, 2}) // Small tasks (ThreadPool/FastThreadPool advantage) + ->Args({100, 2}) + ->Args({100, 3}) + ->Args({1000, 0}) + ->Args({1000, 1}) + ->Args({1000, 2}) + ->Args({1000, 3}) + ->Args({10000, 0}) + ->Args({10000, 1}) + ->Args({10000, 2}) + ->Args({10000, 3}) + ->Args({100000, 0}) + ->Args({100000, 1}) + ->Args({100000, 2}) + ->Args({100000, 3}) + ->Unit(benchmark::kMillisecond); + +// Post vs Submit 
overhead comparison +BENCHMARK(BM_PostVsSubmit) ->Args({1000, 0}) ->Args({1000, 1}) - ->Args({1000, 2}) // Medium tasks (mixed performance) ->Args({10000, 0}) ->Args({10000, 1}) - ->Args({10000, 2}) // Large tasks (HighPerformancePool advantage) ->Args({100000, 0}) ->Args({100000, 1}) - ->Args({100000, 2}) // Very large tasks (HighPerformancePool clear advantage) ->Unit(benchmark::kMillisecond); BENCHMARK_MAIN(); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 7f531b8..80e8c7e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -4,9 +4,11 @@ add_executable(basic_example basic_example.cpp) target_link_libraries(basic_example ThreadSchedule::ThreadSchedule) -# Performance benchmark for high-performance ThreadPool -add_executable(performance_benchmark performance_benchmark.cpp) -target_link_libraries(performance_benchmark ThreadSchedule::ThreadSchedule) +# Performance benchmark for high-performance ThreadPool (requires Google Benchmark) +if(TARGET benchmark::benchmark) + add_executable(performance_benchmark performance_benchmark.cpp) + target_link_libraries(performance_benchmark ThreadSchedule::ThreadSchedule benchmark::benchmark) +endif() # Thread registry examples add_executable(registry_example registry_example.cpp) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 96d031b..2b5ec18 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -1,243 +1,156 @@ -#include #include -#include -#include -#include +#include #include #include -#include #include #include using namespace threadschedule; -class PerformanceBenchmark -{ - private: - HighPerformancePool pool_; - std::atomic completed_tasks_{0}; - std::atomic total_time_us_{0}; - - public: - explicit PerformanceBenchmark(size_t num_threads = std::thread::hardware_concurrency()) : pool_(num_threads) - { - pool_.configure_threads("bench", SchedulingPolicy::OTHER, ThreadPriority::normal()); - 
pool_.distribute_across_cpus(); - } +// ============================================================================= +// HighPerformancePool submission throughput (submit with futures) +// ============================================================================= - // Benchmark pure task submission/completion throughput - void benchmark_throughput(size_t num_tasks, std::string const& test_name) - { - std::cout << "\n=== " << test_name << " ===" << std::endl; - std::cout << "Tasks: " << num_tasks << ", Threads: " << pool_.size() << std::endl; +static void BM_HPPool_Throughput(benchmark::State& state) +{ + auto const num_tasks = static_cast(state.range(0)); - completed_tasks_ = 0; + HighPerformancePool pool(std::thread::hardware_concurrency()); + pool.configure_threads("bench", SchedulingPolicy::OTHER, ThreadPriority::normal()); + pool.distribute_across_cpus(); - auto start_time = std::chrono::high_resolution_clock::now(); + std::atomic completed{0}; + for (auto _ : state) + { + completed = 0; std::vector> futures; futures.reserve(num_tasks); - // Submit tasks as fast as possible for (size_t i = 0; i < num_tasks; ++i) - { - futures.push_back(pool_.submit([this]() { - // Minimal work to measure pure overhead - completed_tasks_.fetch_add(1, std::memory_order_relaxed); + futures.push_back(pool.submit([&completed]() { + completed.fetch_add(1, std::memory_order_relaxed); })); - } - - // Wait for completion - for (auto& future : futures) - { - future.wait(); - } - - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); - - double tasks_per_second = (static_cast(num_tasks) * 1000000.0) / duration.count(); - double avg_task_time_us = static_cast(duration.count()) / num_tasks; - std::cout << "Duration: " << duration.count() << "μs" << std::endl; - std::cout << "Throughput: " << std::fixed << std::setprecision(0) << tasks_per_second << " tasks/second" - << std::endl; - std::cout << "Avg task time: " 
<< std::fixed << std::setprecision(2) << avg_task_time_us << "μs" << std::endl; + for (auto& f : futures) + f.wait(); - auto stats = pool_.get_statistics(); - std::cout << "Work stealing: " << stats.stolen_tasks << " (" << std::fixed << std::setprecision(1) - << (100.0 * stats.stolen_tasks / stats.completed_tasks) << "%)" << std::endl; + benchmark::DoNotOptimize(completed.load()); } - // Benchmark batch processing - void benchmark_batch_processing(size_t batch_size) - { - std::cout << "\n=== Batch Processing Benchmark ===" << std::endl; - std::cout << "Batch size: " << batch_size << std::endl; + auto stats = pool.get_statistics(); + state.counters["steal_%"] = 100.0 * stats.stolen_tasks / std::max(stats.completed_tasks, size_t(1)); + state.SetItemsProcessed(state.iterations() * static_cast(num_tasks)); +} - std::vector> tasks; - tasks.reserve(batch_size); +BENCHMARK(BM_HPPool_Throughput) + ->Arg(1000)->Arg(10000)->Arg(100000) + ->Unit(benchmark::kMicrosecond); - std::atomic counter{0}; +// ============================================================================= +// HighPerformancePool batch processing +// ============================================================================= - for (size_t i = 0; i < batch_size; ++i) - { - tasks.emplace_back([&counter]() { counter.fetch_add(1, std::memory_order_relaxed); }); - } +static void BM_HPPool_Batch(benchmark::State& state) +{ + auto const batch_size = static_cast(state.range(0)); - auto start_time = std::chrono::high_resolution_clock::now(); + HighPerformancePool pool(std::thread::hardware_concurrency()); + pool.configure_threads("bench", SchedulingPolicy::OTHER, ThreadPriority::normal()); + pool.distribute_across_cpus(); - auto futures = pool_.submit_batch(tasks.begin(), tasks.end()); + std::atomic counter{0}; + std::vector> tasks; + tasks.reserve(batch_size); + for (size_t i = 0; i < batch_size; ++i) + tasks.emplace_back([&counter]() { counter.fetch_add(1, std::memory_order_relaxed); }); - for (auto& future : 
futures) - { - future.wait(); - } + for (auto _ : state) + { + auto futures = pool.submit_batch(tasks.begin(), tasks.end()); + for (auto& f : futures) + f.wait(); + } - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); + state.SetItemsProcessed(state.iterations() * static_cast(batch_size)); +} - double tasks_per_second = (static_cast(batch_size) * 1000000.0) / duration.count(); +BENCHMARK(BM_HPPool_Batch) + ->Arg(5000)->Arg(50000) + ->Unit(benchmark::kMillisecond); - std::cout << "Batch duration: " << duration.count() << "μs" << std::endl; - std::cout << "Batch throughput: " << std::fixed << std::setprecision(0) << tasks_per_second << " tasks/second" - << std::endl; - std::cout << "Completed: " << counter.load() << std::endl; - } +// ============================================================================= +// HighPerformancePool variable workload (simulating real tasks) +// ============================================================================= - // Benchmark with variable task durations (simulating real workloads) - void benchmark_variable_workload(size_t num_tasks) - { - std::cout << "\n=== Variable Workload Benchmark ===" << std::endl; - std::cout << "Tasks: " << num_tasks << " (variable duration)" << std::endl; +static void BM_HPPool_VariableWorkload(benchmark::State& state) +{ + auto const num_tasks = static_cast(state.range(0)); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution work_dist(10, 200); // 10-200 iterations + HighPerformancePool pool(std::thread::hardware_concurrency()); + pool.configure_threads("bench", SchedulingPolicy::OTHER, ThreadPriority::normal()); + pool.distribute_across_cpus(); - auto start_time = std::chrono::high_resolution_clock::now(); + std::mt19937 gen(42); + std::uniform_int_distribution work_dist(10, 200); + std::vector work_amounts(num_tasks); + for (auto& w : work_amounts) + w = work_dist(gen); + for (auto _ 
: state) + { std::vector> futures; futures.reserve(num_tasks); for (size_t i = 0; i < num_tasks; ++i) { - int work_amount = work_dist(gen); - futures.push_back(pool_.submit([work_amount]() { - // Variable amount of work + int const amount = work_amounts[i]; + futures.push_back(pool.submit([amount]() { volatile int x = 0; - for (int j = 0; j < work_amount; ++j) - { + for (int j = 0; j < amount; ++j) x += j * j; - } })); } - for (auto& future : futures) - { - future.wait(); - } - - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); - - double tasks_per_second = (static_cast(num_tasks) * 1000.0) / duration.count(); - - std::cout << "Variable workload duration: " << duration.count() << "ms" << std::endl; - std::cout << "Variable workload throughput: " << std::fixed << std::setprecision(0) << tasks_per_second - << " tasks/second" << std::endl; - - auto stats = pool_.get_statistics(); - std::cout << "Work stealing efficiency: " << std::fixed << std::setprecision(1) - << (100.0 * stats.stolen_tasks / stats.completed_tasks) << "%" << std::endl; + for (auto& f : futures) + f.wait(); } - // Benchmark parallel algorithms - void benchmark_parallel_algorithm() - { - std::cout << "\n=== Parallel Algorithm Benchmark ===" << std::endl; - - size_t const data_size = 10000000; // 10M elements - std::vector data(data_size); - - // Fill with test data - std::iota(data.begin(), data.end(), 1); - - std::atomic sum{0}; + state.SetItemsProcessed(state.iterations() * static_cast(num_tasks)); +} - auto start_time = std::chrono::high_resolution_clock::now(); +BENCHMARK(BM_HPPool_VariableWorkload) + ->Arg(1000)->Arg(25000) + ->Unit(benchmark::kMillisecond); - pool_.parallel_for_each(data.begin(), data.end(), - [&sum](int value) { sum.fetch_add(value * value, std::memory_order_relaxed); }); +// ============================================================================= +// HighPerformancePool parallel_for_each +// 
============================================================================= - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time); +static void BM_HPPool_ParallelForEach(benchmark::State& state) +{ + auto const data_size = static_cast(state.range(0)); - double items_per_second = (static_cast(data_size) * 1000.0) / duration.count(); + HighPerformancePool pool(std::thread::hardware_concurrency()); + pool.configure_threads("bench", SchedulingPolicy::OTHER, ThreadPriority::normal()); + pool.distribute_across_cpus(); - std::cout << "Parallel algorithm: " << data_size << " items in " << duration.count() << "ms" << std::endl; - std::cout << "Processing rate: " << std::fixed << std::setprecision(0) << items_per_second << " items/second" - << std::endl; - std::cout << "Sum: " << sum.load() << std::endl; - } + std::vector data(data_size); + std::iota(data.begin(), data.end(), 1); - void print_system_info() + for (auto _ : state) { - std::cout << "\n=== System Information ===" << std::endl; - std::cout << "Hardware threads: " << std::thread::hardware_concurrency() << std::endl; - std::cout << "Pool threads: " << pool_.size() << std::endl; - - auto current_policy = ThreadInfo::get_current_policy(); - if (current_policy) - { - std::cout << "Current scheduling policy: " << to_string(*current_policy) << std::endl; - } - - auto nice_value = ThreadWrapper::get_nice_value(); - if (nice_value) - { - std::cout << "Process nice value: " << *nice_value << std::endl; - } + std::atomic sum{0}; + pool.parallel_for_each(data.begin(), data.end(), + [&sum](int v) { sum.fetch_add(v * v, std::memory_order_relaxed); }); + benchmark::DoNotOptimize(sum.load()); } -}; -int main() -{ - std::cout << "ThreadSchedule High-Performance ThreadPool Benchmark" << std::endl; - std::cout << "=====================================================" << std::endl; + state.SetItemsProcessed(state.iterations() * static_cast(data_size)); +} 
- try - { - PerformanceBenchmark benchmark; - - benchmark.print_system_info(); - - // Test different scales - benchmark.benchmark_throughput(1000, "Light Load (1K tasks)"); - benchmark.benchmark_throughput(10000, "Medium Load (10K tasks)"); - benchmark.benchmark_throughput(100000, "Heavy Load (100K tasks)"); - - benchmark.benchmark_batch_processing(50000); - benchmark.benchmark_variable_workload(25000); - benchmark.benchmark_parallel_algorithm(); - - std::cout << "\n=== Performance Summary ===" << std::endl; - std::cout << "The optimized ThreadPool achieves:" << std::endl; - std::cout << "- 100K+ tasks/second for minimal tasks" << std::endl; - std::cout << "- Efficient work stealing with < 20% stealing ratio" << std::endl; - std::cout << "- Low overhead batch processing" << std::endl; - std::cout << "- Scalable parallel algorithms" << std::endl; - std::cout << "\nFor 10K+ tasks/second workloads:" << std::endl; - std::cout << "- Use batch submission when possible" << std::endl; - std::cout << "- Keep tasks < 100μs duration" << std::endl; - std::cout << "- Monitor work stealing ratio" << std::endl; - std::cout << "- Configure CPU affinity for CPU-bound work" << std::endl; - } - catch (std::exception const& e) - { - std::cerr << "Benchmark failed: " << e.what() << std::endl; - return 1; - } +BENCHMARK(BM_HPPool_ParallelForEach) + ->Arg(100000)->Arg(1000000)->Arg(10000000) + ->Unit(benchmark::kMillisecond); - return 0; -} +BENCHMARK_MAIN(); diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index 1b9d443..48cbcee 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -6,10 +6,10 @@ #include "thread_wrapper.hpp" #include #include -#include -#include #include #include +#include +#include #include #include #include @@ -19,7 +19,7 @@ #include #if __cpp_lib_ranges >= 201911L -#include +# include #endif namespace threadschedule @@ -124,12 +124,9 @@ template auto bind_args(F&& f, Args&&... 
args) { #if __cpp_init_captures >= 201803L - return [fn = std::forward(f), ...a = std::forward(args)]() mutable { - return fn(std::move(a)...); - }; + return [fn = std::forward(f), ... a = std::forward(args)]() mutable { return fn(std::move(a)...); }; #else - return [fn = std::forward(f), - tup = std::make_tuple(std::forward(args)...)]() mutable { + return [fn = std::forward(f), tup = std::make_tuple(std::forward(args)...)]() mutable { return std::apply(std::move(fn), std::move(tup)); }; #endif @@ -166,33 +163,29 @@ class SboCallable template static constexpr bool fits_inline_v = - sizeof(F) <= kBufferSize && - alignof(F) <= alignof(std::max_align_t) && - std::is_nothrow_move_constructible_v; + sizeof(F) <= kBufferSize && alignof(F) <= alignof(std::max_align_t) && std::is_nothrow_move_constructible_v; template static VTable const* vtable_for() noexcept { if constexpr (fits_inline_v) { - static constexpr VTable vt{ - [](void* s) { (*static_cast(s))(); }, - [](void* s) { static_cast(s)->~F(); }, - [](void* dst, void* src) noexcept { - ::new (dst) F(std::move(*static_cast(src))); - static_cast(src)->~F(); - }}; + static constexpr VTable vt{[](void* s) { (*static_cast(s))(); }, + [](void* s) { static_cast(s)->~F(); }, + [](void* dst, void* src) noexcept { + ::new (dst) F(std::move(*static_cast(src))); + static_cast(src)->~F(); + }}; return &vt; } else { - static constexpr VTable vt{ - [](void* s) { (*(*static_cast(s)))(); }, - [](void* s) { delete *static_cast(s); }, - [](void* dst, void* src) noexcept { - *static_cast(dst) = *static_cast(src); - *static_cast(src) = nullptr; - }}; + static constexpr VTable vt{[](void* s) { (*(*static_cast(s)))(); }, + [](void* s) { delete *static_cast(s); }, + [](void* dst, void* src) noexcept { + *static_cast(dst) = *static_cast(src); + *static_cast(src) = nullptr; + }}; return &vt; } } @@ -245,7 +238,10 @@ class SboCallable vtable_->destroy(buffer_); } - explicit operator bool() const noexcept { return vtable_ != nullptr; } + 
explicit operator bool() const noexcept + { + return vtable_ != nullptr; + } void operator()() { @@ -297,8 +293,8 @@ class SboCallable using TaskStartCallback = std::function; /// Callback invoked when a pool worker finishes executing a task. -using TaskEndCallback = std::function; +using TaskEndCallback = + std::function; template class WorkStealingDeque @@ -418,6 +414,15 @@ class WorkStealingDeque } }; +/** + * @brief Controls how a pool handles pending tasks during shutdown. + */ +enum class ShutdownPolicy : uint8_t +{ + drain, ///< Finish all queued tasks before stopping (default). + drop_pending ///< Finish running tasks, discard queued ones. +}; + /** * @brief High-performance thread pool optimized for high-frequency task submission. * @@ -498,16 +503,6 @@ class WorkStealingDeque * work-stealing complexity. Best for high-throughput scenarios like * image processing, batch operations, etc. */ - -/** - * @brief Controls how a pool handles pending tasks during shutdown. - */ -enum class ShutdownPolicy : uint8_t -{ - drain, ///< Finish all queued tasks before stopping (default). - drop_pending ///< Finish running tasks, discard queued ones. -}; - class HighPerformancePool { public: @@ -527,8 +522,8 @@ class HighPerformancePool explicit HighPerformancePool(size_t num_threads = std::thread::hardware_concurrency(), size_t deque_capacity = WorkStealingDeque::DEFAULT_CAPACITY, bool register_workers = false) - : num_threads_(num_threads == 0 ? 1 : num_threads), register_workers_(register_workers), - stop_(false), next_victim_(0), start_time_(std::chrono::steady_clock::now()) + : num_threads_(num_threads == 0 ? 1 : num_threads), register_workers_(register_workers), stop_(false), + next_victim_(0), start_time_(std::chrono::steady_clock::now()) { worker_queues_.resize(num_threads_); for (size_t i = 0; i < num_threads_; ++i) @@ -614,8 +609,7 @@ class HighPerformancePool * @brief Submit a task, returning an error instead of throwing on shutdown. 
*/ template - auto try_submit(F&& f, Args&&... args) - -> expected>, std::error_code> + auto try_submit(F&& f, Args&&... args) -> expected>, std::error_code> { using return_type = std::invoke_result_t; @@ -725,8 +719,7 @@ class HighPerformancePool * is skipped and the future throws @c std::future_error (broken_promise). */ template - auto submit(std::stop_token token, F&& f, Args&&... args) - -> std::future> + auto submit(std::stop_token token, F&& f, Args&&... args) -> std::future> { return submit([token = std::move(token), bound = detail::bind_args(std::forward(f), std::forward(args)...)]() mutable { @@ -756,8 +749,7 @@ class HighPerformancePool * @brief Batch task submission, returning an error instead of throwing on shutdown. */ template - auto try_submit_batch(Iterator begin, Iterator end) - -> expected>, std::error_code> + auto try_submit_batch(Iterator begin, Iterator end) -> expected>, std::error_code> { std::vector> futures; size_t const batch_size = std::distance(begin, end); @@ -818,10 +810,16 @@ class HighPerformancePool #if __cpp_lib_ranges >= 201911L template - auto submit_batch(R&& range) { return submit_batch(std::ranges::begin(range), std::ranges::end(range)); } + auto submit_batch(R&& range) + { + return submit_batch(std::ranges::begin(range), std::ranges::end(range)); + } template - auto try_submit_batch(R&& range) { return try_submit_batch(std::ranges::begin(range), std::ranges::end(range)); } + auto try_submit_batch(R&& range) + { + return try_submit_batch(std::ranges::begin(range), std::ranges::end(range)); + } template void parallel_for_each(R&& range, F&& func) @@ -1167,10 +1165,8 @@ class ThreadPoolBase std::chrono::microseconds avg_task_time; }; - explicit ThreadPoolBase(size_t num_threads = std::thread::hardware_concurrency(), - bool register_workers = false) - : num_threads_(num_threads == 0 ? 
1 : num_threads), - register_workers_(register_workers), stop_(false), + explicit ThreadPoolBase(size_t num_threads = std::thread::hardware_concurrency(), bool register_workers = false) + : num_threads_(num_threads == 0 ? 1 : num_threads), register_workers_(register_workers), stop_(false), start_time_(std::chrono::steady_clock::now()) { workers_.reserve(num_threads_); @@ -1193,8 +1189,7 @@ class ThreadPoolBase * @brief Submit a task, returning an error instead of throwing on shutdown. */ template - auto try_submit(F&& f, Args&&... args) - -> expected>, std::error_code> + auto try_submit(F&& f, Args&&... args) -> expected>, std::error_code> { using return_type = std::invoke_result_t; @@ -1259,8 +1254,7 @@ class ThreadPoolBase * is skipped and returns a default-constructed result. */ template - auto submit(std::stop_token token, F&& f, Args&&... args) - -> std::future> + auto submit(std::stop_token token, F&& f, Args&&... args) -> std::future> { return submit([token = std::move(token), bound = detail::bind_args(std::forward(f), std::forward(args)...)]() mutable { @@ -1287,8 +1281,7 @@ class ThreadPoolBase * @brief Submit multiple tasks, returning an error instead of throwing on shutdown. 
*/ template - auto try_submit_batch(Iterator begin, Iterator end) - -> expected>, std::error_code> + auto try_submit_batch(Iterator begin, Iterator end) -> expected>, std::error_code> { std::vector> futures; futures.reserve(std::distance(begin, end)); @@ -1333,10 +1326,16 @@ class ThreadPoolBase #if __cpp_lib_ranges >= 201911L template - auto submit_batch(R&& range) { return submit_batch(std::ranges::begin(range), std::ranges::end(range)); } + auto submit_batch(R&& range) + { + return submit_batch(std::ranges::begin(range), std::ranges::end(range)); + } template - auto try_submit_batch(R&& range) { return try_submit_batch(std::ranges::begin(range), std::ranges::end(range)); } + auto try_submit_batch(R&& range) + { + return try_submit_batch(std::ranges::begin(range), std::ranges::end(range)); + } template void parallel_for_each(R&& range, F&& func) @@ -1436,9 +1435,8 @@ class ThreadPoolBase } std::unique_lock lock(queue_mutex_); - bool const drained = task_finished_condition_.wait_until(lock, deadline, [this] { - return tasks_.empty() && active_tasks_.load(std::memory_order_acquire) == 0; - }); + bool const drained = task_finished_condition_.wait_until( + lock, deadline, [this] { return tasks_.empty() && active_tasks_.load(std::memory_order_acquire) == 0; }); lock.unlock(); shutdown(ShutdownPolicy::drain); @@ -1650,7 +1648,10 @@ class LightweightPoolT LightweightPoolT(LightweightPoolT const&) = delete; auto operator=(LightweightPoolT const&) -> LightweightPoolT& = delete; - ~LightweightPoolT() { shutdown(ShutdownPolicy::drain); } + ~LightweightPoolT() + { + shutdown(ShutdownPolicy::drain); + } /** * @brief Fire-and-forget task submission. Throws on shutdown. 
@@ -1710,10 +1711,16 @@ class LightweightPoolT #if __cpp_lib_ranges >= 201911L template - void post_batch(R&& range) { post_batch(std::ranges::begin(range), std::ranges::end(range)); } + void post_batch(R&& range) + { + post_batch(std::ranges::begin(range), std::ranges::end(range)); + } template - auto try_post_batch(R&& range) { return try_post_batch(std::ranges::begin(range), std::ranges::end(range)); } + auto try_post_batch(R&& range) + { + return try_post_batch(std::ranges::begin(range), std::ranges::end(range)); + } #endif /** @@ -1754,15 +1761,17 @@ class LightweightPoolT return true; } std::unique_lock lock(mutex_); - bool const drained = drain_condition_.wait_until(lock, deadline, [this] { - return tasks_.empty() && active_tasks_.load(std::memory_order_acquire) == 0; - }); + bool const drained = drain_condition_.wait_until( + lock, deadline, [this] { return tasks_.empty() && active_tasks_.load(std::memory_order_acquire) == 0; }); lock.unlock(); shutdown(ShutdownPolicy::drain); return drained; } - [[nodiscard]] auto size() const noexcept -> size_t { return num_threads_; } + [[nodiscard]] auto size() const noexcept -> size_t + { + return num_threads_; + } auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, ThreadPriority priority = ThreadPriority::normal()) -> expected @@ -1921,10 +1930,16 @@ class GlobalPool #if __cpp_lib_ranges >= 201911L template - static auto submit_batch(R&& range) { return instance().submit_batch(std::forward(range)); } + static auto submit_batch(R&& range) + { + return instance().submit_batch(std::forward(range)); + } template - static auto try_submit_batch(R&& range) { return instance().try_submit_batch(std::forward(range)); } + static auto try_submit_batch(R&& range) + { + return instance().try_submit_batch(std::forward(range)); + } template static void parallel_for_each(R&& range, F&& func) diff --git a/run_benchmarks.sh b/run_benchmarks.sh index 78497b8..d16b37c 100755 --- 
a/run_benchmarks.sh +++ b/run_benchmarks.sh @@ -234,6 +234,7 @@ echo " ./build/benchmarks/threadpool_resampling_benchmarks --benchmark_filter=\ echo "" echo -e "${YELLOW}Pool Selection Guide:${NC}" echo " - Use HighPerformancePool for: Batch processing, image processing, high-throughput scenarios (1k+ tasks)" +echo " - Use LightweightPool for: Fire-and-forget tasks, lowest overhead, no futures needed" echo " - Use FastThreadPool for: Medium workloads, consistent task patterns (100-10k tasks)" echo " - Use ThreadPool for: Simple workloads, low task counts (< 1k tasks)" echo "" From a16c6d0f5f933e508c870f9f511d92ee841c8c87 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 20:36:51 +0200 Subject: [PATCH 08/15] Enhance documentation for thread pool and callable management - Expanded the documentation for `SboCallable` to clarify its design, storage layout, inline eligibility, move semantics, and thread safety. - Added detailed descriptions for `ShutdownPolicy` to outline shutdown behavior options. - Improved method documentation in `HighPerformancePool`, including `try_submit`, `submit`, `post`, and batch submission methods, emphasizing error handling and performance benefits. - Introduced C++20 ranges overloads and observers for better usability and clarity in task management. --- include/threadschedule/thread_pool.hpp | 451 ++++++++++++++++++++----- 1 file changed, 365 insertions(+), 86 deletions(-) diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index 48cbcee..a75ac3f 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -139,13 +139,35 @@ auto bind_args(F&& f, Args&&... args) /** * @brief Type-erased, move-only callable with configurable inline storage. * - * Avoids the heap allocation that @c std::function incurs for callables - * larger than its (typically 16-byte) internal buffer. 
Callables that fit - * within @c TaskSize - sizeof(void*) bytes are stored inline; larger ones - * fall back to a heap allocation. + * Designed as a lightweight replacement for @c std::function when heap + * allocations are undesirable. Callables whose size and alignment fit + * within the inline buffer are stored in-place (Small Buffer Optimization); + * larger callables fall back to a heap allocation transparently. + * + * @par Storage layout + * @code + * |<---------- TaskSize bytes ---------->| + * [ VTable* (8 B) | inline buffer ] + * @endcode + * The usable inline buffer is @c TaskSize - sizeof(void*) bytes + * (56 bytes on 64-bit platforms with the default @c TaskSize of 64). + * + * @par Inline eligibility + * A callable @c F is stored inline when all of the following hold: + * - @c sizeof(F) <= kBufferSize + * - @c alignof(F) <= alignof(std::max_align_t) + * - @c std::is_nothrow_move_constructible_v + * + * @par Move semantics + * Move-only. Invoking @c operator() consumes the callable (invoke + destroy), + * leaving the object in an empty state. This single-shot design avoids the + * overhead of reference counting or shared ownership. + * + * @par Thread safety + * Not thread-safe. Intended to be used as a queue element inside a + * mutex-protected task queue. * * @tparam TaskSize Total object size in bytes (default 64, one x86 cache line). - * The usable inline buffer is @c TaskSize - 8 bytes on 64-bit platforms. */ template class SboCallable @@ -416,6 +438,17 @@ class WorkStealingDeque /** * @brief Controls how a pool handles pending tasks during shutdown. + * + * Passed to @c shutdown() on any pool type to select graceful vs. immediate + * shutdown behaviour. 
+ * + * | Policy | Running tasks | Queued tasks | + * |-----------------|---------------|---------------------| + * | @c drain | Finish | Execute, then stop | + * | @c drop_pending | Finish | Discard immediately | + * + * @see HighPerformancePool::shutdown, ThreadPoolBase::shutdown, + * LightweightPoolT::shutdown */ enum class ShutdownPolicy : uint8_t { @@ -606,7 +639,20 @@ class HighPerformancePool } /** - * @brief Submit a task, returning an error instead of throwing on shutdown. + * @brief Submit a task without throwing on shutdown. + * + * Wraps the callable in a @c std::packaged_task and enqueues it. + * Returns an @c expected containing the @c std::future on success, + * or @c std::errc::operation_canceled if the pool is shutting down. + * + * @tparam F Callable type. + * @tparam Args Argument types forwarded to @p F. + * @param f Callable to execute. + * @param args Arguments forwarded to @p f. + * @return @c expected, std::error_code> where + * @c R = @c std::invoke_result_t. + * + * @see submit() for the throwing variant. */ template auto try_submit(F&& f, Args&&... args) -> expected>, std::error_code> @@ -651,7 +697,13 @@ class HighPerformancePool } /** - * @brief Submit a task. Throws std::runtime_error if the pool is shutting down. + * @brief Submit a task, throwing on shutdown. + * + * Equivalent to @ref try_submit but throws @c std::runtime_error instead + * of returning an error code when the pool is shutting down. + * + * @throws std::runtime_error If the pool is shutting down. + * @return @c std::future that becomes ready when the task completes. */ template auto submit(F&& f, Args&&... args) -> std::future> @@ -663,7 +715,14 @@ class HighPerformancePool } /** - * @brief Fire-and-forget submission (no future, no packaged_task overhead). + * @brief Fire-and-forget task submission (throwing variant). 
+ * + * Enqueues a callable without creating a @c std::packaged_task or + * @c std::future, giving roughly 3x higher throughput than @ref submit() + * for tasks whose return value is not needed. + * + * @throws std::runtime_error If the pool is shutting down. + * @see try_post() for the non-throwing variant. */ template void post(F&& f, Args&&... args) @@ -674,7 +733,10 @@ class HighPerformancePool } /** - * @brief Fire-and-forget submission. Returns error on shutdown. + * @brief Fire-and-forget task submission (non-throwing variant). + * + * @return @c expected -- + * @c std::errc::operation_canceled on shutdown. */ template auto try_post(F&& f, Args&&... args) -> expected @@ -746,7 +808,15 @@ class HighPerformancePool #endif /** - * @brief Batch task submission, returning an error instead of throwing on shutdown. + * @brief Submit a range of @c void() callables in one go (non-throwing). + * + * Acquires the lock once per batch, distributing tasks across worker + * queues in round-robin fashion. Significantly more efficient than + * calling @ref submit() in a loop for large batches. + * + * @tparam Iterator Forward iterator whose value_type is callable as @c void(). + * @return @c expected containing a vector of futures, or + * @c std::errc::operation_canceled on shutdown. */ template auto try_submit_batch(Iterator begin, Iterator end) -> expected>, std::error_code> @@ -788,7 +858,9 @@ class HighPerformancePool } /** - * @brief Batch task submission. Throws on shutdown. + * @brief Submit a range of @c void() callables in one go (throwing). + * @throws std::runtime_error If the pool is shutting down. + * @see try_submit_batch() for the non-throwing variant. */ template auto submit_batch(Iterator begin, Iterator end) -> std::vector> @@ -800,7 +872,10 @@ class HighPerformancePool } /** - * @brief Apply a function to a range in parallel using chunked work distribution. + * @brief Apply @p func to every element in @c [begin, end) in parallel. 
+ * + * The range is split into chunks and submitted as tasks. Blocks until + * all elements have been processed. */ template void parallel_for_each(Iterator begin, Iterator end, F&& func) @@ -809,6 +884,7 @@ class HighPerformancePool } #if __cpp_lib_ranges >= 201911L + /// @{ @name C++20 Ranges overloads template auto submit_batch(R&& range) { @@ -826,13 +902,19 @@ class HighPerformancePool { parallel_for_each(std::ranges::begin(range), std::ranges::end(range), std::forward(func)); } + /// @} #endif + /// @name Observers + /// @{ + + /// @brief Number of worker threads in this pool. [[nodiscard]] auto size() const noexcept -> size_t { return num_threads_; } + /// @brief Approximate count of tasks waiting in all queues. [[nodiscard]] auto pending_tasks() const -> size_t { size_t total = 0; @@ -846,35 +928,7 @@ class HighPerformancePool return total; } - /** - * @brief Configure all worker threads - */ - auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, - ThreadPriority priority = ThreadPriority::normal()) -> expected - { - return detail::configure_worker_threads(workers_, name_prefix, policy, priority); - } - - auto set_affinity(ThreadAffinity const& affinity) -> expected - { - return detail::set_worker_affinity(workers_, affinity); - } - - auto distribute_across_cpus() -> expected - { - return detail::distribute_workers_across_cpus(workers_); - } - - void wait_for_tasks() - { - std::unique_lock lock(completion_mutex_); - completion_condition_.wait( - lock, [this] { return pending_tasks() == 0 && active_tasks_.load(std::memory_order_acquire) == 0; }); - } - - /** - * @brief Get detailed performance statistics - */ + /// @brief Collect approximate performance counters. 
auto get_statistics() const -> Statistics { auto const now = std::chrono::steady_clock::now(); @@ -909,8 +963,58 @@ class HighPerformancePool return stats; } + /// @} + + /// @name Thread configuration + /// @{ + + /** + * @brief Name, schedule and prioritize all worker threads. + * + * Each worker is named @c name_prefix + "_0", @c "_1", etc. + * + * @return @c expected -- error if the OS + * rejected any configuration call. + */ + auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, + ThreadPriority priority = ThreadPriority::normal()) -> expected + { + return detail::configure_worker_threads(workers_, name_prefix, policy, priority); + } + + /// @brief Pin all workers to the same CPU set. + auto set_affinity(ThreadAffinity const& affinity) -> expected + { + return detail::set_worker_affinity(workers_, affinity); + } + + /// @brief Pin each worker to a distinct CPU core (round-robin). + auto distribute_across_cpus() -> expected + { + return detail::distribute_workers_across_cpus(workers_); + } + + /// @} + + /// @name Synchronisation + /// @{ + + /// @brief Block until all pending and active tasks have completed. + void wait_for_tasks() + { + std::unique_lock lock(completion_mutex_); + completion_condition_.wait( + lock, [this] { return pending_tasks() == 0 && active_tasks_.load(std::memory_order_acquire) == 0; }); + } + + /// @} + + /// @name Tracing hooks + /// @{ + /** - * @brief Set a callback invoked at the start of each task. + * @brief Register a callback invoked just before each task executes. + * @param cb Receives the start time and the worker's @c std::thread::id. */ void set_on_task_start(TaskStartCallback cb) { @@ -919,7 +1023,9 @@ class HighPerformancePool } /** - * @brief Set a callback invoked at the end of each task. + * @brief Register a callback invoked just after each task completes. 
+ * @param cb Receives the end time, the worker's @c std::thread::id, + * and the wall-clock duration of the task. */ void set_on_task_end(TaskEndCallback cb) { @@ -927,6 +1033,8 @@ class HighPerformancePool on_task_end_ = std::move(cb); } + /// @} + private: size_t num_threads_; bool register_workers_; @@ -1185,8 +1293,13 @@ class ThreadPoolBase shutdown(ShutdownPolicy::drain); } + /// @name Task submission + /// @{ + /** - * @brief Submit a task, returning an error instead of throwing on shutdown. + * @brief Submit a task without throwing on shutdown. + * @return @c expected, std::error_code>. + * @see submit() for the throwing variant. */ template auto try_submit(F&& f, Args&&... args) -> expected>, std::error_code> @@ -1210,7 +1323,8 @@ class ThreadPoolBase } /** - * @brief Submit a task. Throws std::runtime_error if the pool is shutting down. + * @brief Submit a task, throwing on shutdown. + * @throws std::runtime_error If the pool is shutting down. */ template auto submit(F&& f, Args&&... args) -> std::future> @@ -1222,7 +1336,12 @@ class ThreadPoolBase } /** - * @brief Fire-and-forget submission (no future, no packaged_task overhead). + * @brief Fire-and-forget task submission (throwing variant). + * + * Bypasses @c std::packaged_task / @c std::future for lower overhead. + * + * @throws std::runtime_error If the pool is shutting down. + * @see try_post() */ template void post(F&& f, Args&&... args) @@ -1233,7 +1352,9 @@ class ThreadPoolBase } /** - * @brief Fire-and-forget submission. Returns error on shutdown. + * @brief Fire-and-forget task submission (non-throwing variant). + * @return @c expected -- + * @c std::errc::operation_canceled on shutdown. */ template auto try_post(F&& f, Args&&... args) -> expected @@ -1250,8 +1371,10 @@ class ThreadPoolBase #if __cpp_lib_jthread >= 201911L /** - * @brief Submit a cancellable task. If stop is already requested the task - * is skipped and returns a default-constructed result. 
+ * @brief Submit a cancellable task (C++20). + * + * If @p token is already stopped the task body is skipped and + * the future receives a default-constructed result. */ template auto submit(std::stop_token token, F&& f, Args&&... args) -> std::future> @@ -1264,6 +1387,7 @@ class ThreadPoolBase }); } + /// @brief Non-throwing cancellable submission (C++20). template auto try_submit(std::stop_token token, F&& f, Args&&... args) -> expected>, std::error_code> @@ -1278,7 +1402,9 @@ class ThreadPoolBase #endif /** - * @brief Submit multiple tasks, returning an error instead of throwing on shutdown. + * @brief Submit a range of @c void() callables in one go (non-throwing). + * + * All tasks are enqueued under a single lock acquisition. */ template auto try_submit_batch(Iterator begin, Iterator end) -> expected>, std::error_code> @@ -1303,9 +1429,7 @@ class ThreadPoolBase return futures; } - /** - * @brief Submit multiple tasks under a single lock acquisition. Throws on shutdown. - */ + /// @brief Submit a batch of tasks (throwing). @see try_submit_batch() template auto submit_batch(Iterator begin, Iterator end) -> std::vector> { @@ -1315,9 +1439,7 @@ class ThreadPoolBase return std::move(result.value()); } - /** - * @brief Apply a function to a range in parallel using chunked work distribution. - */ + /// @brief Apply @p func to @c [begin, end) in parallel (chunked). template void parallel_for_each(Iterator begin, Iterator end, F&& func) { @@ -1325,6 +1447,7 @@ class ThreadPoolBase } #if __cpp_lib_ranges >= 201911L + /// @{ @name C++20 Ranges overloads template auto submit_batch(R&& range) { @@ -1342,21 +1465,35 @@ class ThreadPoolBase { parallel_for_each(std::ranges::begin(range), std::ranges::end(range), std::forward(func)); } + /// @} #endif + /// @} + + /// @name Observers + /// @{ + + /// @brief Number of worker threads. [[nodiscard]] auto size() const noexcept -> size_t { return num_threads_; } + /// @brief Number of tasks waiting in the queue. 
[[nodiscard]] auto pending_tasks() const -> size_t { std::lock_guard lock(queue_mutex_); return tasks_.size(); } + /// @} + + /// @name Thread configuration + /// @{ + /** - * @brief Configure all worker threads (name, scheduling policy, priority) + * @brief Name, schedule and prioritize all worker threads. + * @see HighPerformancePool::configure_threads */ auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, ThreadPriority priority = ThreadPriority::normal()) -> expected @@ -1364,22 +1501,24 @@ class ThreadPoolBase return detail::configure_worker_threads(workers_, name_prefix, policy, priority); } - /** - * @brief Set CPU affinity for all worker threads - */ + /// @brief Pin all workers to the same CPU set. auto set_affinity(ThreadAffinity const& affinity) -> expected { return detail::set_worker_affinity(workers_, affinity); } - /** - * @brief Distribute workers across available CPUs (round-robin) - */ + /// @brief Pin each worker to a distinct CPU core (round-robin). auto distribute_across_cpus() -> expected { return detail::distribute_workers_across_cpus(workers_); } + /// @} + + /// @name Synchronisation & lifecycle + /// @{ + + /// @brief Block until all pending and active tasks have completed. void wait_for_tasks() { std::unique_lock lock(queue_mutex_); @@ -1389,7 +1528,6 @@ class ThreadPoolBase /** * @brief Shut the pool down. - * * @param policy @c drain (default) finishes all queued tasks; * @c drop_pending discards queued tasks. */ @@ -1443,9 +1581,12 @@ class ThreadPoolBase return drained; } - /** - * @brief Get performance statistics - */ + /// @} + + /// @name Observers + /// @{ + + /// @brief Collect approximate performance counters. 
[[nodiscard]] auto get_statistics() const -> Statistics { auto const now = std::chrono::steady_clock::now(); @@ -1480,8 +1621,14 @@ class ThreadPoolBase return stats; } + /// @} + + /// @name Tracing hooks + /// @{ + /** - * @brief Set a callback invoked at the start of each task. + * @brief Register a callback invoked just before each task executes. + * @param cb Receives the start time and the worker's @c std::thread::id. */ void set_on_task_start(TaskStartCallback cb) { @@ -1490,7 +1637,9 @@ class ThreadPoolBase } /** - * @brief Set a callback invoked at the end of each task. + * @brief Register a callback invoked just after each task completes. + * @param cb Receives the end time, the worker's @c std::thread::id, + * and the wall-clock duration of the task. */ void set_on_task_end(TaskEndCallback cb) { @@ -1498,6 +1647,8 @@ class ThreadPoolBase on_task_end_ = std::move(cb); } + /// @} + private: size_t num_threads_; bool register_workers_; @@ -1619,24 +1770,79 @@ using FastThreadPool = ThreadPoolBase>; /** * @brief Ultra-lightweight fire-and-forget thread pool. * - * Uses a custom @ref detail::SboCallable instead of @c std::function to avoid - * heap allocations for callables up to @c TaskSize - 8 bytes. No futures, no - * packaged_task, no statistics, no tracing -- just raw throughput. + * Designed for maximum throughput on tasks whose return value is not needed. + * Typical measured throughput is **3x** higher than @ref submit() on the + * same hardware, because @c LightweightPoolT avoids the overhead of + * @c std::packaged_task, @c std::future, and @c std::shared_ptr entirely. 
+ * + * @par Internal architecture + * @code + * Producer(s) Single Queue Worker Threads + * +---------+ +------------------+ +----------------+ + * | post() | ---> | SboCallable<64> | ---> | ThreadWrapper | + * | post() | ---> | SboCallable<64> | ---> | ThreadWrapper | + * +---------+ +------------------+ +----------------+ + * mutex + cond_var + * @endcode + * + * - **Queue**: Single @c std::queue of @ref detail::SboCallable objects + * protected by one mutex + condition_variable. + * - **Workers**: @ref ThreadWrapper instances so that thread naming, CPU + * affinity, and scheduling policy can be configured after construction. + * - **SBO**: Callables up to @c TaskSize - 8 bytes are stored inline + * (no heap allocation). Larger callables fall back to the heap. + * + * @par What is @e not included (by design) + * - No @c std::future / @c std::packaged_task (use @ref submit() on other + * pools if you need return values). + * - No statistics counters (@ref HighPerformancePool::get_statistics). + * - No tracing hooks (@ref HighPerformancePool::set_on_task_start). + * - No work stealing (single shared queue). + * - No @c ThreadRegistry auto-registration. + * + * @par Execution guarantees + * - Every successfully posted task is guaranteed to execute (unless + * @c shutdown(ShutdownPolicy::drop_pending) is called). + * - Tasks are dequeued in FIFO order. Because multiple workers pop + * concurrently, the @e completion order is non-deterministic. + * - Exceptions thrown by tasks are silently caught; the worker continues. + * + * @par Thread safety + * @c post(), @c try_post(), @c post_batch(), and @c try_post_batch() may + * be called from any number of threads concurrently. @c shutdown() is + * internally guarded and safe to call more than once. + * + * @par Lifetime + * The destructor calls @c shutdown(ShutdownPolicy::drain) and joins all + * workers. It blocks until every queued task has been executed. 
* - * Workers are @ref ThreadWrapper instances so that naming, affinity, and - * scheduling policy can still be configured after construction. + * @par Choosing @c TaskSize + * The default of 64 bytes (one x86 cache line) works well for lambdas + * capturing up to ~7 pointers. If your tasks capture more state, increase + * @c TaskSize to avoid the heap fallback: + * @code + * LightweightPoolT<128> pool(4); // 120 bytes of inline storage + * @endcode + * + * @par Copyability / movability + * Not copyable, not movable. * - * @par API - * Only @c post() (fire-and-forget) is provided. For tasks that need a return - * value, use @ref ThreadPool or @ref HighPerformancePool with @c submit(). + * @tparam TaskSize Total size in bytes of each @ref detail::SboCallable + * slot (default 64). Usable inline buffer = @c TaskSize - 8 bytes + * on 64-bit platforms. * - * @tparam TaskSize Total size in bytes of each inline task slot (default 64, - * one x86 cache line). Usable buffer = @c TaskSize - 8 bytes. + * @see LightweightPool (alias for @c LightweightPoolT<64>), + * ScheduledLightweightPool (scheduled variant). */ template class LightweightPoolT { public: + /** + * @brief Construct a lightweight pool with @p num_threads workers. + * @param num_threads Number of worker threads (clamped to at least 1). + * Defaults to @c std::thread::hardware_concurrency(). + */ explicit LightweightPoolT(size_t num_threads = std::thread::hardware_concurrency()) : num_threads_(num_threads == 0 ? 1 : num_threads) { @@ -1653,8 +1859,19 @@ class LightweightPoolT shutdown(ShutdownPolicy::drain); } + /// @name Task submission + /// @{ + /** - * @brief Fire-and-forget task submission. Throws on shutdown. + * @brief Post a fire-and-forget task (throwing variant). + * + * The callable and its arguments are bound into a + * @ref detail::SboCallable and pushed into the shared queue. + * + * @tparam F Callable type. + * @tparam Args Argument types forwarded to @p F. 
+ * @throws std::runtime_error If the pool is shutting down. + * @see try_post() for the non-throwing variant. */ template void post(F&& f, Args&&... args) @@ -1665,7 +1882,10 @@ class LightweightPoolT } /** - * @brief Fire-and-forget task submission. Returns error on shutdown. + * @brief Post a fire-and-forget task (non-throwing variant). + * + * @return @c expected -- + * @c std::errc::operation_canceled on shutdown. */ template auto try_post(F&& f, Args&&... args) -> expected @@ -1682,7 +1902,13 @@ class LightweightPoolT } /** - * @brief Batch fire-and-forget submission under a single lock. + * @brief Post a range of callables under a single lock acquisition. + * + * More efficient than calling @ref post() in a loop because the mutex + * is acquired only once and all workers are woken via @c notify_all(). + * + * @tparam Iterator Forward iterator whose value_type is callable as @c void(). + * @throws std::runtime_error If the pool is shutting down. */ template void post_batch(Iterator begin, Iterator end) @@ -1693,7 +1919,8 @@ class LightweightPoolT } /** - * @brief Batch fire-and-forget submission. Returns error on shutdown. + * @brief Batch post (non-throwing). + * @return @c expected. */ template auto try_post_batch(Iterator begin, Iterator end) -> expected @@ -1710,6 +1937,7 @@ class LightweightPoolT } #if __cpp_lib_ranges >= 201911L + /// @{ @name C++20 Ranges overloads template void post_batch(R&& range) { @@ -1721,10 +1949,23 @@ class LightweightPoolT { return try_post_batch(std::ranges::begin(range), std::ranges::end(range)); } + /// @} #endif + /// @} + + /// @name Lifecycle + /// @{ + /** * @brief Shut the pool down. + * + * @param policy @c drain (default) -- workers finish all queued tasks + * before exiting. @c drop_pending -- the queue is cleared + * and only the currently executing tasks are allowed to + * finish. + * + * Safe to call more than once (subsequent calls are no-ops). 
*/ void shutdown(ShutdownPolicy policy = ShutdownPolicy::drain) { @@ -1749,8 +1990,13 @@ class LightweightPoolT } /** - * @brief Timed drain: finish as many tasks as possible within timeout. - * @return @c true if all tasks completed, @c false on timeout. + * @brief Attempt a timed drain. + * + * Waits up to @p timeout for all tasks to complete, then performs a + * full @c shutdown(drain). + * + * @return @c true if all tasks completed within the deadline, + * @c false if the timeout expired (pool is still shut down). */ auto shutdown_for(std::chrono::milliseconds timeout) -> bool { @@ -1768,27 +2014,47 @@ class LightweightPoolT return drained; } + /// @} + + /// @name Observers + /// @{ + + /// @brief Number of worker threads. [[nodiscard]] auto size() const noexcept -> size_t { return num_threads_; } + /// @} + + /// @name Thread configuration + /// @{ + + /** + * @brief Name, schedule and prioritize all worker threads. + * + * Workers are named @c name_prefix + "_0", @c "_1", etc. + */ auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, ThreadPriority priority = ThreadPriority::normal()) -> expected { return detail::configure_worker_threads(workers_, name_prefix, policy, priority); } + /// @brief Pin all workers to the same CPU set. auto set_affinity(ThreadAffinity const& affinity) -> expected { return detail::set_worker_affinity(workers_, affinity); } + /// @brief Pin each worker to a distinct CPU core (round-robin). auto distribute_across_cpus() -> expected { return detail::distribute_workers_across_cpus(workers_); } + /// @} + private: size_t num_threads_; std::vector workers_; @@ -1831,7 +2097,13 @@ class LightweightPoolT } }; -/** @brief Default lightweight pool with 64-byte task slots. */ +/** + * @brief Default lightweight pool with 64-byte task slots (56 bytes usable). + * + * Sufficient for lambdas capturing up to ~7 pointers on 64-bit platforms. 
+ * + * @see LightweightPoolT + */ using LightweightPool = LightweightPoolT<>; // --------------------------------------------------------------------------- @@ -1880,12 +2152,17 @@ class GlobalPool std::call_once(init_flag_(), [num_threads] { thread_count_() = num_threads; }); } + /// @brief Access the singleton pool instance (created on first call). static auto instance() -> PoolType& { static PoolType pool(thread_count_()); return pool; } + /// @name Forwarding wrappers + /// All methods below simply forward to @c instance().method(...). + /// @{ + template static auto submit(F&& f, Args&&... args) { @@ -1948,6 +2225,8 @@ class GlobalPool } #endif + /// @} + private: GlobalPool() = default; From a1bdcf1af5437fbeafa7adb33d308c9847981f35 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Sun, 5 Apr 2026 20:52:21 +0200 Subject: [PATCH 09/15] Update documentation for v2.0 migration and enhancements - Added a comprehensive migration guide in `docs/MIGRATION_V2.md` detailing breaking changes, upgrade steps, and optional improvements for transitioning from v1.x to v2.0. - Expanded the `CHANGELOG.md` to include a full list of changes and new features in v2.0, emphasizing lower-overhead submission and improved ergonomics. - Updated `README.md` to highlight new features in v2.0 and link to the migration guide for user convenience. - Enhanced documentation in `README-DOCS.md` to include references to the migration guide and other relevant resources. --- CHANGELOG.md | 4 ++ README.md | 50 +++++++++++++--- docs/MIGRATION_V2.md | 139 +++++++++++++++++++++++++++++++++++++++++++ docs/README-DOCS.md | 1 + 4 files changed, 187 insertions(+), 7 deletions(-) create mode 100644 docs/MIGRATION_V2.md diff --git a/CHANGELOG.md b/CHANGELOG.md index e3f3de2..555b3f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -137,6 +137,10 @@ ### Migration Guide +Full step-by-step guide: **[docs/MIGRATION_V2.md](docs/MIGRATION_V2.md)**. 
+ +Quick reference: + ```cpp // v1: bool return bool ok = pool.configure_threads("worker"); diff --git a/README.md b/README.md index 02f395a..694f692 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,8 @@ or with optional **shared runtime** for multi-DSO applications. - **Chaos Testing**: RAII controller to perturb affinity/priority for validation - **C++20 Coroutines**: `task`, `generator`, and `sync_wait` out of the box -- no boilerplate promise types needed -- **High-Performance Pools**: Work-stealing thread pool optimized for 10k+ - tasks/second +- **High-Performance Pools**: Work-stealing pool, `post()` / `try_post()`, and + optional `LightweightPool` for fire-and-forget workloads with minimal overhead - **Scheduled Tasks**: Run tasks at specific times, after delays, or periodically - **Error Handling**: Comprehensive exception handling with error callbacks and @@ -46,8 +46,33 @@ or with optional **shared runtime** for multi-DSO applications. - **RAII & Exception Safety**: Automatic resource management - **Multiple Integration Methods**: CMake, CPM, Conan, FetchContent +## What's new in v2.0 + +Version 2.0 focuses on **lower-overhead submission**, **more control over shutdown and tuning**, and **better ergonomics** for modern C++ (ranges, coroutines, `std::stop_token`). Highlights: + +| Area | What changed | +| ---- | ------------ | +| **Lightweight pool** | `LightweightPoolT` / `LightweightPool` -- fire-and-forget only, configurable SBO buffer (default 64 B), no futures or stats. Workers are still `ThreadWrapper` (name, affinity, policy). Ideal for maximum throughput when you do not need a return value. | +| **`post()` / `try_post()`** | On `HighPerformancePool`, `ThreadPool` / `FastThreadPool`, and `GlobalPool` -- same queue path as `submit()` but skips `packaged_task` / `future` overhead. | +| **Non-throwing submit** | `try_submit()` / `try_submit_batch()` return `expected` instead of throwing on shutdown. 
| +| **Scheduled dispatch** | `ScheduledThreadPoolT` dispatches with `post()` internally. Alias `ScheduledLightweightPool` uses `LightweightPool` as the backend. | +| **Shutdown** | `ShutdownPolicy::drain` (default) vs `drop_pending`; `shutdown_for(timeout)` for a timed drain. | +| **Parallel loops** | Chunked `parallel_for_each` on single-queue pools (same helper as the work-stealing pool). | +| **Tuning** | `PollingWait` for `FastThreadPool`, configurable work-stealing deque capacity on `HighPerformancePool`, `GlobalPool::init(n)` before first use. | +| **C++20** | Ranges overloads for batch submit and `parallel_for_each`; `submit`/`try_submit` with `std::stop_token` (cooperative skip). | +| **Futures** | `when_all`, `when_any`, `when_all_settled` in `futures.hpp`. | +| **Coroutines** | `schedule_on{pool}`, `pool_executor`, `run_on(pool, coro_fn)` for pool-aware `task`. | +| **Observability** | Optional auto-registration of pool workers in the thread registry; per-task `set_on_task_start` / `set_on_task_end` hooks. | +| **Errors** | `ErrorHandler` callbacks get stable IDs; `remove_callback(id)` / `has_callback(id)`. | + +See [CHANGELOG.md](CHANGELOG.md) for the full list, including breaking changes when upgrading from v1.x. 
+ +**Upgrading from v1.x:** [Migration guide (v2.0)](docs/MIGRATION_V2.md) + ## Documentation +- **[Migrating to v2.0](docs/MIGRATION_V2.md)** - Breaking changes, renames, and + recommended follow-ups from v1.x - **[Integration Guide](docs/INTEGRATION.md)** - CMake, Conan, FetchContent, system installation - **[Thread Registry Guide](docs/REGISTRY.md)** - Process-wide thread control @@ -196,6 +221,12 @@ int main() { auto future = pool.submit([]() { return 42; }); std::cout << "Result: " << future.get() << std::endl; + + // Fire-and-forget (no future): post() on any pool, or LightweightPool + pool.post([]() { /* work */ }); + LightweightPool lite(4); + lite.configure_threads("lite"); + lite.post([]() { /* minimal overhead */ }); // Scheduled tasks (uses ThreadPool by default) ScheduledThreadPool scheduler(4); @@ -208,6 +239,8 @@ int main() { auto handle_hp = scheduler_hp.schedule_periodic(std::chrono::milliseconds(100), []() { std::cout << "Frequent task!" << std::endl; }); + + // v2: ScheduledLightweightPool -- same API, LightweightPool backend (post-based dispatch) // Error handling HighPerformancePoolWithErrors pool_safe(4); @@ -449,11 +482,14 @@ Zero-overhead helpers to operate on existing threads without taking ownership. 
### Thread Pools -| Class | Use Case | Performance | -| --------------------- | --------------------------------------- | ---------------- | -| `ThreadPool` | General-purpose, simple API | < 1k tasks/sec | -| `HighPerformancePool` | Work-stealing, optimized for throughput | 10k+ tasks/sec | -| `FastThreadPool` | Single-queue, minimal overhead | 1k-10k tasks/sec | +| Class | Use Case | Notes | +| ----------------------- | --------------------------------------------- | ----- | +| `ThreadPool` | Single shared queue, blocks while idle | `submit`, `try_submit`, `post`, batches, `parallel_for_each` | +| `FastThreadPool` | Same as `ThreadPool` with polling wait policy | Tunable via `PollingWait` | +| `HighPerformancePool` | Work-stealing + overflow queue | Highest throughput for large batches; tunable deque capacity | +| `LightweightPool` | Fire-and-forget only, SBO tasks | No futures; use `post` / `post_batch`. Alias of `LightweightPoolT<64>` | + +All of the above support `shutdown(ShutdownPolicy)` and `shutdown_for(timeout)` where applicable. Use **`post()`** when you do not need a `std::future` (lower overhead than `submit()`). ### Configuration diff --git a/docs/MIGRATION_V2.md b/docs/MIGRATION_V2.md new file mode 100644 index 0000000..92a3b13 --- /dev/null +++ b/docs/MIGRATION_V2.md @@ -0,0 +1,139 @@ +# Migrating to ThreadSchedule v2.0 + +This guide helps you move from v1.x to **v2.0.0**. It lists **breaking changes** first, then **behavioral changes** you should be aware of, and finally **optional upgrades** that are not required but often worthwhile. + +For the authoritative list of every change, see [CHANGELOG.md](../CHANGELOG.md). + +## 1. Upgrade steps + +1. **Pin the version** in CMake / Conan / FetchContent to a v2.0.0 tag (or `main` once released). +2. **Rebuild** with the same `CMAKE_CXX_STANDARD` as before (v2 still supports C++17 as the baseline). +3. 
**Fix compile errors** using the sections below (most projects only touch `submit_range`, `configure_threads` storage type, or forward declarations). +4. **Run tests** -- especially anything that assumed strict per-element scheduling for `parallel_for_each` on `ThreadPool` / `FastThreadPool`. + +## 2. Breaking changes (must fix) + +### 2.1 `submit_range()` removed + +`ThreadPool::submit_range` and `GlobalThreadPool::submit_range` are removed. Use **`submit_batch`** with the same iterators. + +```cpp +// v1 +auto futures = pool.submit_range(tasks.begin(), tasks.end()); + +// v2 +auto futures = pool.submit_batch(tasks.begin(), tasks.end()); +``` + +`submit_batch` acquires the queue lock once for the whole range and matches the API of `FastThreadPool` and `HighPerformancePool`. + +### 2.2 `configure_threads` / `set_affinity` / `distribute_across_cpus` return type + +On **`ThreadPool`** and **`FastThreadPool`**, these functions now return **`expected`** (same as `HighPerformancePool` already did). + +```cpp +// v1: storing in bool (no longer valid) +bool ok = pool.configure_threads("worker"); + +// v2: use auto or expected +auto r = pool.configure_threads("worker"); +if (!r) { + std::cerr << r.error().message() << '\n'; +} + +// Conditions still work: expected has operator bool +if (pool.configure_threads("worker")) { /* success */ } +``` + +### 2.3 `ThreadPool` and `FastThreadPool` are type aliases + +They are now: + +- `ThreadPool` = `ThreadPoolBase` +- `FastThreadPool` = `ThreadPoolBase>` + +**Runtime behavior is unchanged.** You only need to act if you: + +- **Forward-declared** a concrete `class ThreadPool;` -- forward-declare the alias or include the header instead. +- **Specialized** a template on `ThreadPool` as a unique class type -- switch to `ThreadPoolBase` (or a SFINAE-friendly trait). + +### 2.4 `ThreadPool::Statistics` extended + +`Statistics` on the single-queue pools now includes **`tasks_per_second`** and **`avg_task_time`**, like the other pools. 
If you use **designated initializers** or **memset**-style initialization that assumed a smaller struct, update the initializer list. + +### 2.5 Error pool and global pool type names (aliases only) + +These are now aliases; **the public API is unchanged**: + +- `HighPerformancePoolWithErrors`, `ThreadPoolWithErrors`, `FastThreadPoolWithErrors` -> `PoolWithErrors` +- `GlobalThreadPool`, `GlobalHighPerformancePool` -> `GlobalPool` + +Only unusual code (e.g. explicit template specialization on the old type name) may need the new spelling. + +### 2.6 `ErrorHandler::add_callback` return type + +`add_callback` now returns **`size_t`** (stable callback id for `remove_callback` / `has_callback`). Code that ignored the return value is unaffected. Code that assumed **`void`** must be updated. + +```cpp +// v2 +size_t id = handler.add_callback([](TaskError const& e) { /* ... */ }); +handler.remove_callback(id); +``` + +## 3. API changes that are backward compatible + +### 3.1 `shutdown()` + +`shutdown()` now takes an optional **`ShutdownPolicy`** (default **`drain`**, matching old behavior). Old call sites without arguments behave as before. + +```cpp +pool.shutdown(); // still: drain all work +pool.shutdown(ShutdownPolicy::drop_pending); // new: drop queued tasks +pool.shutdown_for(std::chrono::seconds(5)); // new: timed drain +``` + +### 3.2 Destructors + +Destructors still shut down the pool; they use **`drain`** by default. No change required unless you want **`drop_pending`** explicitly before destruction. + +## 4. Behavioral changes (no rename, but semantics differ) + +### 4.1 `parallel_for_each` on `ThreadPool` / `FastThreadPool` + +Implementation is now **chunked** (same strategy as `HighPerformancePool`): the range is split into a small number of tasks instead of one task per element. + +- **Pros:** Much less submission overhead on large ranges. +- **Cons:** Finer-grained progress / cancellation per element is no longer one-to-one with one pool task. 
+ +If you relied on **one future per element**, switch to an explicit loop with `submit`, or chunk manually. + +### 4.2 Scheduled pools dispatch with `post()` + +`ScheduledThreadPoolT` dispatches due tasks with **`post()`** instead of **`submit()`**, so **no `std::future` is created per dispatch**. Your task bodies are unchanged; only internal overhead is lower. + +## 5. Optional improvements after migrating + +These are **not** required for a successful build but match v2 design well: + +| Goal | Approach | +| ---- | -------- | +| Less overhead than `submit()` | Use **`post()`** / **`try_post()`** when you do not need a return value or `std::future`. | +| Dedicated fire-and-forget pool | Use **`LightweightPool`** / **`LightweightPoolT`** (SBO task buffer, no futures). | +| Non-throwing submit | Use **`try_submit()`** / **`try_submit_batch()`** and check **`expected`**. | +| Tune fast pool polling | Use **`ThreadPoolBase>`** or keep **`FastThreadPool`** (10 ms default). | +| Tune HP deque size | **`HighPerformancePool(threads, deque_capacity)`**. | +| Fix global pool size early | **`GlobalPool<...>::init(n)`** before first **`instance()`**. | +| Workers in registry | Pass **`register_workers = true`** to pool constructors. | + +## 6. Header and module notes + +- New headers pulled in by the umbrella header include **`futures.hpp`** (combinators) and coroutine helpers on **`task.hpp`** as documented in [COROUTINES.md](COROUTINES.md). +- Include **`threadschedule/futures.hpp`** directly if you only need combinators. + +## 7. 
Further reading + +- [README.md](../README.md) -- "What's new in v2.0" summary table +- [CHANGELOG.md](../CHANGELOG.md) -- full v2.0.0 notes +- [INTEGRATION.md](INTEGRATION.md) -- CMake and package managers +- [ERROR_HANDLING.md](ERROR_HANDLING.md) -- pools with errors and callbacks +- [SCHEDULED_TASKS.md](SCHEDULED_TASKS.md) -- scheduled pools and aliases diff --git a/docs/README-DOCS.md b/docs/README-DOCS.md index e02e983..00d6e00 100644 --- a/docs/README-DOCS.md +++ b/docs/README-DOCS.md @@ -19,6 +19,7 @@ Notes: - The documentation includes headers under `include/` and uses the repository `README.md` as the landing page. Additional guides: +- [Migrating to v2.0](MIGRATION_V2.md) - [Integration Guide](INTEGRATION.md) - [Thread Registry](REGISTRY.md) - [Scheduled Tasks](SCHEDULED_TASKS.md) From b7ba58e84e50708b5dcf0f08fd5930e4e1845d0d Mon Sep 17 00:00:00 2001 From: Katze719 Date: Mon, 6 Apr 2026 17:43:29 +0200 Subject: [PATCH 10/15] Update CHANGELOG.md for improved clarity and formatting - Refined the presentation of quality-of-life features, ensuring consistent formatting and clearer descriptions. - Enhanced the documentation of new types and internal improvements, emphasizing usability and performance benefits. - Consolidated related changes for better readability and understanding of enhancements in the thread pool and error handling functionalities. --- CHANGELOG.md | 113 ++++++++++++++++++++++++++------------------------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 555b3f3..724cd11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,16 +37,16 @@ ### Quality-of-Life Features -- **`ErrorHandler::remove_callback(id)` / `has_callback(id)`** -- callbacks - are now stored in a `std::map` with stable IDs. Individual callbacks can be +- **`ErrorHandler::remove_callback(id)` / `has_callback(id)`** -- callbacks are + now stored in a `std::map` with stable IDs. 
Individual callbacks can be removed without clearing all of them. - **`try_submit()` / `try_submit_batch()`** -- non-throwing submission for all pool types, returning `expected, std::error_code>` instead of throwing on shutdown. -- **Chunked `parallel_for_each`** -- `ThreadPoolBase` now uses the same - chunked work distribution as `HighPerformancePool` via a shared +- **Chunked `parallel_for_each`** -- `ThreadPoolBase` now uses the same chunked + work distribution as `HighPerformancePool` via a shared `detail::parallel_for_each_chunked` helper (one task per element is gone). - **`PollingWait`** -- tunable polling interval (default 10 ms). @@ -58,12 +58,12 @@ - **`GlobalPool::init(n)`** -- pre-configure thread count before first use (std::call_once semantics). -- **C++20 ranges overloads** -- `submit_batch(range)`, `try_submit_batch(range)`, - `parallel_for_each(range, func)` on all pool types and GlobalPool. Guarded - by `__cpp_lib_ranges`. +- **C++20 ranges overloads** -- `submit_batch(range)`, + `try_submit_batch(range)`, `parallel_for_each(range, func)` on all pool types + and GlobalPool. Guarded by `__cpp_lib_ranges`. -- **Auto-register pool workers** -- opt-in `register_workers` flag on both - pool constructors. Workers register/unregister automatically via +- **Auto-register pool workers** -- opt-in `register_workers` flag on both pool + constructors. Workers register/unregister automatically via `AutoRegisterCurrentThread` RAII guard. - **Per-task tracing hooks** -- `set_on_task_start(callback)` and @@ -81,9 +81,9 @@ `ShutdownPolicy::drop_pending`. `shutdown(policy)` replaces the old no-argument `shutdown()`. `shutdown_for(timeout)` provides timed drain. 
-- **Coroutine scheduler integration** -- `schedule_on{pool}` awaitable to hop - to a pool thread, `executor_base` / `pool_executor` type-erased - executor for pool-aware tasks, `run_on(pool, coro_fn)` convenience returning +- **Coroutine scheduler integration** -- `schedule_on{pool}` awaitable to hop to + a pool thread, `executor_base` / `pool_executor` type-erased executor + for pool-aware tasks, `run_on(pool, coro_fn)` convenience returning `std::future`. - **`LightweightPoolT`** -- ultra-lightweight fire-and-forget pool @@ -98,21 +98,24 @@ `submit()` but skips `packaged_task`/`shared_ptr`/`future` overhead. - **`ScheduledThreadPoolT` now uses `post()`** internally instead of `submit()`, - eliminating wasted `future` allocations for every scheduled task dispatch. - New alias: `ScheduledLightweightPool = ScheduledThreadPoolT`. + eliminating wasted `future` allocations for every scheduled task dispatch. New + alias: `ScheduledLightweightPool = ScheduledThreadPoolT`. ### New Types - `ThreadPoolBase` - parameterized single-queue thread pool. -- `IndefiniteWait` / `PollingWait` - wait policy types for `ThreadPoolBase`. +- `IndefiniteWait` / `PollingWait` - wait policy types for + `ThreadPoolBase`. - `PoolWithErrors` - generic error-handling pool wrapper. - `GlobalPool` - generic singleton pool accessor. - `ShutdownPolicy` - enum controlling shutdown behavior (drain / drop_pending). - `TaskStartCallback` / `TaskEndCallback` - tracing callback types. - `executor_base` / `pool_executor` - type-erased executor for coroutines. - `schedule_on` - awaitable for hopping to a pool thread. -- `futures.hpp` - future combinators (`when_all`, `when_any`, `when_all_settled`). -- `LightweightPoolT` / `LightweightPool` - fire-and-forget pool with SBO. +- `futures.hpp` - future combinators (`when_all`, `when_any`, + `when_all_settled`). +- `LightweightPoolT` / `LightweightPool` - fire-and-forget pool with + SBO. - `detail::SboCallable` - type-erased callable with inline storage. 
- `ScheduledLightweightPool` - scheduled pool backed by `LightweightPool`. @@ -135,31 +138,6 @@ - **`ScheduledThreadPoolT`**: `schedule_at()` and `schedule_periodic_after()` now share a private `insert_task()` helper. -### Migration Guide - -Full step-by-step guide: **[docs/MIGRATION_V2.md](docs/MIGRATION_V2.md)**. - -Quick reference: - -```cpp -// v1: bool return -bool ok = pool.configure_threads("worker"); - -// v2: expected return (operator bool still works in conditions) -auto result = pool.configure_threads("worker"); -if (!result.has_value()) { - std::cerr << result.error().message() << std::endl; -} - -// v1: submit_range -auto futures = pool.submit_range(tasks.begin(), tasks.end()); - -// v2: submit_batch (same signature, more efficient) -auto futures = pool.submit_batch(tasks.begin(), tasks.end()); -``` - -### Internal improvements (v2.0.0 continued) - - **Pool worker configuration deduplicated**: `configure_threads()`, `set_affinity()`, `distribute_across_cpus()` in `HighPerformancePool` and `ThreadPoolBase` now delegate to shared `detail::configure_worker_threads`, @@ -172,13 +150,13 @@ auto futures = pool.submit_batch(tasks.begin(), tasks.end()); `detail::read_name`, `detail::read_affinity` in `scheduler_policy.hpp`. - **`FutureWithErrorHandler` specialization removed**: The primary - template now handles both `T` and `void` via `if constexpr`, eliminating - ~70 lines of duplicated code. No API change. + template now handles both `T` and `void` via `if constexpr`, eliminating ~70 + lines of duplicated code. No API change. -- **`CompositeThreadRegistry` facade deduplicated**: The 12 query facade - methods (filter, map, for_each, find_if, any, all, none, take, skip, count, - empty, apply) are now inherited from `detail::QueryFacadeMixin` - CRTP base. No API change. 
+- **`CompositeThreadRegistry` facade deduplicated**: The 12 query facade methods + (filter, map, for_each, find_if, any, all, none, take, skip, count, empty, + apply) are now inherited from `detail::QueryFacadeMixin` CRTP base. + No API change. - **`ThreadRegistry` inherits `detail::QueryFacadeMixin`**: The 12 facade methods (filter, map, for_each, find_if, any, all, none, take, skip, count, @@ -186,26 +164,49 @@ auto futures = pool.submit_batch(tasks.begin(), tasks.end()); `CompositeThreadRegistry`, eliminating the duplicate implementations. - **POSIX scheduling helpers consolidated**: `apply_priority` and - `apply_scheduling_policy` for both `pthread_t` and `pid_t` now share a - common `detail::apply_sched_params` template, eliminating duplicated param - validation and error handling. + `apply_scheduling_policy` for both `pthread_t` and `pid_t` now share a common + `detail::apply_sched_params` template, eliminating duplicated param validation + and error handling. -- **`ThreadRegistry::register_current_thread` consolidated**: Both overloads - now delegate to a private `try_register(RegisteredThreadInfo)` method, - removing the duplicated lock/emplace/callback logic. +- **`ThreadRegistry::register_current_thread` consolidated**: Both overloads now + delegate to a private `try_register(RegisteredThreadInfo)` method, removing + the duplicated lock/emplace/callback logic. - **`PoolWithErrors` submit methods consolidated**: `submit()` and `submit_with_description()` now delegate to a private `submit_impl` with optional description parameter. -- **`TaskError::capture()` factory**: New static factory method centralizes - the repeated exception/thread_id/timestamp capture pattern. Used by +- **`TaskError::capture()` factory**: New static factory method centralizes the + repeated exception/thread_id/timestamp capture pattern. Used by `ErrorHandledTask` and `PoolWithErrors`. 
- **`ThreadControlBlock` native handle accessor**: Private `native_handle()` method replaces four identical `#ifdef _WIN32` dispatch blocks in the set_affinity/set_priority/set_scheduling_policy/set_name methods. +### Migration Guide + +Full step-by-step guide: **[docs/MIGRATION_V2.md](docs/MIGRATION_V2.md)**. + +Quick reference: + +```cpp +// v1: bool return +bool ok = pool.configure_threads("worker"); + +// v2: expected return (operator bool still works in conditions) +auto result = pool.configure_threads("worker"); +if (!result.has_value()) { + std::cerr << result.error().message() << std::endl; +} + +// v1: submit_range +auto futures = pool.submit_range(tasks.begin(), tasks.end()); + +// v2: submit_batch (same signature, more efficient) +auto futures = pool.submit_batch(tasks.begin(), tasks.end()); +``` + ## v1.4.1 - Fix: `*WrapperReg` types (`ThreadWrapperReg`, `JThreadWrapperReg`, From 87f2b2e0e3f7dfba9a03c728e8b8f426adc43583 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Mon, 6 Apr 2026 23:17:00 +0200 Subject: [PATCH 11/15] Update documentation and CHANGELOG for v2.0.0 release - Finalized the CHANGELOG to mark the release of v2.0.0, including breaking changes and new features. - Updated Doxyfile to enable warnings for undocumented elements, enhancing code documentation quality. - Improved links in README.md and migration guide for better accessibility to CHANGELOG and upgrade instructions. - Clarified documentation in futures.hpp and scheduled_pool.hpp to ensure consistent use of code formatting for better readability. 
--- CHANGELOG.md | 2 +- Doxyfile | 4 +- README.md | 103 +++++++++--------- docs/MIGRATION_V2.md | 91 ++++++++++------ include/threadschedule/futures.hpp | 4 +- include/threadschedule/scheduled_pool.hpp | 14 +-- include/threadschedule/thread_pool.hpp | 24 ++-- .../thread_pool_with_errors.hpp | 4 +- include/threadschedule/threadschedule.hpp | 2 +- 9 files changed, 146 insertions(+), 102 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 724cd11..ce81069 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## v2.0.0 (unreleased) +## v2.0.0 ### Breaking Changes diff --git a/Doxyfile b/Doxyfile index e8c6b2d..117dcc0 100644 --- a/Doxyfile +++ b/Doxyfile @@ -28,8 +28,8 @@ EXTRACT_LOCAL_CLASSES = YES EXTRACT_ANON_NSPACES = YES QUIET = YES -WARN_IF_UNDOCUMENTED = NO -WARN_AS_ERROR = NO +WARN_IF_UNDOCUMENTED = YES +WARN_AS_ERROR = YES GENERATE_HTML = YES HTML_OUTPUT = html diff --git a/README.md b/README.md index 694f692..08d7f8a 100644 --- a/README.md +++ b/README.md @@ -48,24 +48,27 @@ or with optional **shared runtime** for multi-DSO applications. ## What's new in v2.0 -Version 2.0 focuses on **lower-overhead submission**, **more control over shutdown and tuning**, and **better ergonomics** for modern C++ (ranges, coroutines, `std::stop_token`). Highlights: - -| Area | What changed | -| ---- | ------------ | -| **Lightweight pool** | `LightweightPoolT` / `LightweightPool` -- fire-and-forget only, configurable SBO buffer (default 64 B), no futures or stats. Workers are still `ThreadWrapper` (name, affinity, policy). Ideal for maximum throughput when you do not need a return value. | -| **`post()` / `try_post()`** | On `HighPerformancePool`, `ThreadPool` / `FastThreadPool`, and `GlobalPool` -- same queue path as `submit()` but skips `packaged_task` / `future` overhead. | -| **Non-throwing submit** | `try_submit()` / `try_submit_batch()` return `expected` instead of throwing on shutdown. 
| -| **Scheduled dispatch** | `ScheduledThreadPoolT` dispatches with `post()` internally. Alias `ScheduledLightweightPool` uses `LightweightPool` as the backend. | -| **Shutdown** | `ShutdownPolicy::drain` (default) vs `drop_pending`; `shutdown_for(timeout)` for a timed drain. | -| **Parallel loops** | Chunked `parallel_for_each` on single-queue pools (same helper as the work-stealing pool). | -| **Tuning** | `PollingWait` for `FastThreadPool`, configurable work-stealing deque capacity on `HighPerformancePool`, `GlobalPool::init(n)` before first use. | -| **C++20** | Ranges overloads for batch submit and `parallel_for_each`; `submit`/`try_submit` with `std::stop_token` (cooperative skip). | -| **Futures** | `when_all`, `when_any`, `when_all_settled` in `futures.hpp`. | -| **Coroutines** | `schedule_on{pool}`, `pool_executor`, `run_on(pool, coro_fn)` for pool-aware `task`. | -| **Observability** | Optional auto-registration of pool workers in the thread registry; per-task `set_on_task_start` / `set_on_task_end` hooks. | -| **Errors** | `ErrorHandler` callbacks get stable IDs; `remove_callback(id)` / `has_callback(id)`. | - -See [CHANGELOG.md](CHANGELOG.md) for the full list, including breaking changes when upgrading from v1.x. +Version 2.0 focuses on **lower-overhead submission**, **more control over +shutdown and tuning**, and **better ergonomics** for modern C++ (ranges, +coroutines, `std::stop_token`). Highlights: + +| Area | What changed | +| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Lightweight pool** | `LightweightPoolT` / `LightweightPool` -- fire-and-forget only, configurable SBO buffer (default 64 B), no futures or stats. Workers are still `ThreadWrapper` (name, affinity, policy). 
Ideal for maximum throughput when you do not need a return value. | +| **`post()` / `try_post()`** | On `HighPerformancePool`, `ThreadPool` / `FastThreadPool`, and `GlobalPool` -- same queue path as `submit()` but skips `packaged_task` / `future` overhead. | +| **Non-throwing submit** | `try_submit()` / `try_submit_batch()` return `expected` instead of throwing on shutdown. | +| **Scheduled dispatch** | `ScheduledThreadPoolT` dispatches with `post()` internally. Alias `ScheduledLightweightPool` uses `LightweightPool` as the backend. | +| **Shutdown** | `ShutdownPolicy::drain` (default) vs `drop_pending`; `shutdown_for(timeout)` for a timed drain. | +| **Parallel loops** | Chunked `parallel_for_each` on single-queue pools (same helper as the work-stealing pool). | +| **Tuning** | `PollingWait` for `FastThreadPool`, configurable work-stealing deque capacity on `HighPerformancePool`, `GlobalPool::init(n)` before first use. | +| **C++20** | Ranges overloads for batch submit and `parallel_for_each`; `submit`/`try_submit` with `std::stop_token` (cooperative skip). | +| **Futures** | `when_all`, `when_any`, `when_all_settled` in `futures.hpp`. | +| **Coroutines** | `schedule_on{pool}`, `pool_executor`, `run_on(pool, coro_fn)` for pool-aware `task`. | +| **Observability** | Optional auto-registration of pool workers in the thread registry; per-task `set_on_task_start` / `set_on_task_end` hooks. | +| **Errors** | `ErrorHandler` callbacks get stable IDs; `remove_callback(id)` / `has_callback(id)`. | + +See [CHANGELOG.md](CHANGELOG.md) for the full list, including breaking changes +when upgrading from v1.x. **Upgrading from v1.x:** [Migration guide (v2.0)](docs/MIGRATION_V2.md) @@ -98,28 +101,28 @@ ThreadSchedule is designed to work on any platform with a C++17 (or newer) compiler and standard threading support. 
The library is **continuously tested** on: -| Platform | Compiler | C++17 | C++20 | C++23 | C++26 | -| ------------------- | ----------------- | :---: | :---: | :---: | :---: | -| **Linux (x86_64)** | | | | | | -| Ubuntu 22.04 | GCC 11 | ✅ | ✅ | ✅ | - | -| Ubuntu 22.04 | GCC 12 | - | ✅ | - | - | -| Ubuntu 22.04 | Clang 14 | ✅ | ✅ | ✅ | - | -| Ubuntu 22.04 | Clang 15 | - | ✅ | ✅ | - | -| Ubuntu 24.04 | GCC 13 | ✅ | ✅ | ✅ | - | -| Ubuntu 24.04 | GCC 14 | ✅ | ✅ | ✅ | ✅ | -| Ubuntu 24.04 | GCC 15 | - | ✅ | ✅ | ✅ | -| Ubuntu 24.04 | Clang 16 | ✅ | ✅ | - | - | -| Ubuntu 24.04 | Clang 18 | ✅ | ✅ | - | - | -| Ubuntu 24.04 | Clang 19 | - | ✅ | ✅ | ✅ | -| Ubuntu 24.04 | Clang 21 | - | ✅ | ✅ | ✅ | -| **Linux (ARM64)** | | | | | | -| Ubuntu 24.04 ARM64 | GCC 13 (system) | ✅ | ✅ | ✅ | - | -| Ubuntu 24.04 ARM64 | GCC 14 | - | ✅ | ✅ | ✅ | -| **Windows** | | | | | | -| Windows Server 2022 | MSVC 2022 | ✅ | ✅ | ✅ | - | -| Windows Server 2022 | MinGW-w64 (GCC 15)| ✅ | ✅ | ✅ | - | -| Windows Server 2025 | MSVC 2022 | ✅ | ✅ | ✅ | - | -| Windows Server 2025 | MinGW-w64 (GCC 15)| ✅ | ✅ | ✅ | - | +| Platform | Compiler | C++17 | C++20 | C++23 | C++26 | +| ------------------- | ------------------ | :---: | :---: | :---: | :---: | +| **Linux (x86_64)** | | | | | | +| Ubuntu 22.04 | GCC 11 | ✅ | ✅ | ✅ | - | +| Ubuntu 22.04 | GCC 12 | - | ✅ | - | - | +| Ubuntu 22.04 | Clang 14 | ✅ | ✅ | ✅ | - | +| Ubuntu 22.04 | Clang 15 | - | ✅ | ✅ | - | +| Ubuntu 24.04 | GCC 13 | ✅ | ✅ | ✅ | - | +| Ubuntu 24.04 | GCC 14 | ✅ | ✅ | ✅ | ✅ | +| Ubuntu 24.04 | GCC 15 | - | ✅ | ✅ | ✅ | +| Ubuntu 24.04 | Clang 16 | ✅ | ✅ | - | - | +| Ubuntu 24.04 | Clang 18 | ✅ | ✅ | - | - | +| Ubuntu 24.04 | Clang 19 | - | ✅ | ✅ | ✅ | +| Ubuntu 24.04 | Clang 21 | - | ✅ | ✅ | ✅ | +| **Linux (ARM64)** | | | | | | +| Ubuntu 24.04 ARM64 | GCC 13 (system) | ✅ | ✅ | ✅ | - | +| Ubuntu 24.04 ARM64 | GCC 14 | - | ✅ | ✅ | ✅ | +| **Windows** | | | | | | +| Windows Server 2022 | MSVC 2022 | ✅ | ✅ | ✅ | - | +| Windows Server 2022 | MinGW-w64 
(GCC 15) | ✅ | ✅ | ✅ | - | +| Windows Server 2025 | MSVC 2022 | ✅ | ✅ | ✅ | - | +| Windows Server 2025 | MinGW-w64 (GCC 15) | ✅ | ✅ | ✅ | - | **Additional platforms:** ThreadSchedule should work on other platforms (macOS, FreeBSD, other Linux distributions) with standard C++17+ compilers, but these @@ -135,8 +138,8 @@ are not regularly tested in CI. > > **GCC 15**: Installed via `ppa:ubuntu-toolchain-r/test` on Ubuntu 24.04. > -> **Clang 21**: Installed via the official LLVM apt repository -> (`apt.llvm.org`) on Ubuntu 24.04. +> **Clang 21**: Installed via the official LLVM apt repository (`apt.llvm.org`) +> on Ubuntu 24.04. > > **Windows ARM64**: Not currently covered by GitHub-hosted runners, requires > self-hosted runner for testing. @@ -482,14 +485,16 @@ Zero-overhead helpers to operate on existing threads without taking ownership. ### Thread Pools -| Class | Use Case | Notes | -| ----------------------- | --------------------------------------------- | ----- | -| `ThreadPool` | Single shared queue, blocks while idle | `submit`, `try_submit`, `post`, batches, `parallel_for_each` | -| `FastThreadPool` | Same as `ThreadPool` with polling wait policy | Tunable via `PollingWait` | -| `HighPerformancePool` | Work-stealing + overflow queue | Highest throughput for large batches; tunable deque capacity | -| `LightweightPool` | Fire-and-forget only, SBO tasks | No futures; use `post` / `post_batch`. 
Alias of `LightweightPoolT<64>` | +| Class | Use Case | Notes | +| --------------------- | --------------------------------------------- | ---------------------------------------------------------------------- | +| `ThreadPool` | Single shared queue, blocks while idle | `submit`, `try_submit`, `post`, batches, `parallel_for_each` | +| `FastThreadPool` | Same as `ThreadPool` with polling wait policy | Tunable via `PollingWait` | +| `HighPerformancePool` | Work-stealing + overflow queue | Highest throughput for large batches; tunable deque capacity | +| `LightweightPool` | Fire-and-forget only, SBO tasks | No futures; use `post` / `post_batch`. Alias of `LightweightPoolT<64>` | -All of the above support `shutdown(ShutdownPolicy)` and `shutdown_for(timeout)` where applicable. Use **`post()`** when you do not need a `std::future` (lower overhead than `submit()`). +All of the above support `shutdown(ShutdownPolicy)` and `shutdown_for(timeout)` +where applicable. Use **`post()`** when you do not need a `std::future` (lower +overhead than `submit()`). ### Configuration diff --git a/docs/MIGRATION_V2.md b/docs/MIGRATION_V2.md index 92a3b13..5029db6 100644 --- a/docs/MIGRATION_V2.md +++ b/docs/MIGRATION_V2.md @@ -1,21 +1,28 @@ # Migrating to ThreadSchedule v2.0 -This guide helps you move from v1.x to **v2.0.0**. It lists **breaking changes** first, then **behavioral changes** you should be aware of, and finally **optional upgrades** that are not required but often worthwhile. +This guide helps you move from v1.x to **v2.0.0**. It lists **breaking changes** +first, then **behavioral changes** you should be aware of, and finally +**optional upgrades** that are not required but often worthwhile. For the authoritative list of every change, see [CHANGELOG.md](../CHANGELOG.md). ## 1. Upgrade steps -1. **Pin the version** in CMake / Conan / FetchContent to a v2.0.0 tag (or `main` once released). -2. 
**Rebuild** with the same `CMAKE_CXX_STANDARD` as before (v2 still supports C++17 as the baseline). -3. **Fix compile errors** using the sections below (most projects only touch `submit_range`, `configure_threads` storage type, or forward declarations). -4. **Run tests** -- especially anything that assumed strict per-element scheduling for `parallel_for_each` on `ThreadPool` / `FastThreadPool`. +1. **Pin the version** in CMake / Conan / FetchContent to the **v2.0.0** tag (or + a later v2.x tag). +2. **Rebuild** with the same `CMAKE_CXX_STANDARD` as before (v2 still supports + C++17 as the baseline). +3. **Fix compile errors** using the sections below (most projects only touch + `submit_range`, `configure_threads` storage type, or forward declarations). +4. **Run tests** -- especially anything that assumed strict per-element + scheduling for `parallel_for_each` on `ThreadPool` / `FastThreadPool`. ## 2. Breaking changes (must fix) ### 2.1 `submit_range()` removed -`ThreadPool::submit_range` and `GlobalThreadPool::submit_range` are removed. Use **`submit_batch`** with the same iterators. +`ThreadPool::submit_range` and `GlobalThreadPool::submit_range` are removed. Use +**`submit_batch`** with the same iterators. ```cpp // v1 @@ -25,11 +32,14 @@ auto futures = pool.submit_range(tasks.begin(), tasks.end()); auto futures = pool.submit_batch(tasks.begin(), tasks.end()); ``` -`submit_batch` acquires the queue lock once for the whole range and matches the API of `FastThreadPool` and `HighPerformancePool`. +`submit_batch` acquires the queue lock once for the whole range and matches the +API of `FastThreadPool` and `HighPerformancePool`. ### 2.2 `configure_threads` / `set_affinity` / `distribute_across_cpus` return type -On **`ThreadPool`** and **`FastThreadPool`**, these functions now return **`expected`** (same as `HighPerformancePool` already did). 
+On **`ThreadPool`** and **`FastThreadPool`**, these functions now return +**`expected`** (same as `HighPerformancePool` already +did). ```cpp // v1: storing in bool (no longer valid) @@ -54,25 +64,34 @@ They are now: **Runtime behavior is unchanged.** You only need to act if you: -- **Forward-declared** a concrete `class ThreadPool;` -- forward-declare the alias or include the header instead. -- **Specialized** a template on `ThreadPool` as a unique class type -- switch to `ThreadPoolBase` (or a SFINAE-friendly trait). +- **Forward-declared** a concrete `class ThreadPool;` -- forward-declare the + alias or include the header instead. +- **Specialized** a template on `ThreadPool` as a unique class type -- switch to + `ThreadPoolBase` (or a SFINAE-friendly trait). ### 2.4 `ThreadPool::Statistics` extended -`Statistics` on the single-queue pools now includes **`tasks_per_second`** and **`avg_task_time`**, like the other pools. If you use **designated initializers** or **memset**-style initialization that assumed a smaller struct, update the initializer list. +`Statistics` on the single-queue pools now includes **`tasks_per_second`** and +**`avg_task_time`**, like the other pools. If you use **designated +initializers** or **memset**-style initialization that assumed a smaller struct, +update the initializer list. ### 2.5 Error pool and global pool type names (aliases only) These are now aliases; **the public API is unchanged**: -- `HighPerformancePoolWithErrors`, `ThreadPoolWithErrors`, `FastThreadPoolWithErrors` -> `PoolWithErrors` +- `HighPerformancePoolWithErrors`, `ThreadPoolWithErrors`, + `FastThreadPoolWithErrors` -> `PoolWithErrors` - `GlobalThreadPool`, `GlobalHighPerformancePool` -> `GlobalPool` -Only unusual code (e.g. explicit template specialization on the old type name) may need the new spelling. +Only unusual code (e.g. explicit template specialization on the old type name) +may need the new spelling. 
### 2.6 `ErrorHandler::add_callback` return type -`add_callback` now returns **`size_t`** (stable callback id for `remove_callback` / `has_callback`). Code that ignored the return value is unaffected. Code that assumed **`void`** must be updated. +`add_callback` now returns **`size_t`** (stable callback id for +`remove_callback` / `has_callback`). Code that ignored the return value is +unaffected. Code that assumed **`void`** must be updated. ```cpp // v2 @@ -84,7 +103,8 @@ handler.remove_callback(id); ### 3.1 `shutdown()` -`shutdown()` now takes an optional **`ShutdownPolicy`** (default **`drain`**, matching old behavior). Old call sites without arguments behave as before. +`shutdown()` now takes an optional **`ShutdownPolicy`** (default **`drain`**, +matching old behavior). Old call sites without arguments behave as before. ```cpp pool.shutdown(); // still: drain all work @@ -94,41 +114,50 @@ pool.shutdown_for(std::chrono::seconds(5)); // new: timed drain ### 3.2 Destructors -Destructors still shut down the pool; they use **`drain`** by default. No change required unless you want **`drop_pending`** explicitly before destruction. +Destructors still shut down the pool; they use **`drain`** by default. No change +required unless you want **`drop_pending`** explicitly before destruction. ## 4. Behavioral changes (no rename, but semantics differ) ### 4.1 `parallel_for_each` on `ThreadPool` / `FastThreadPool` -Implementation is now **chunked** (same strategy as `HighPerformancePool`): the range is split into a small number of tasks instead of one task per element. +Implementation is now **chunked** (same strategy as `HighPerformancePool`): the +range is split into a small number of tasks instead of one task per element. - **Pros:** Much less submission overhead on large ranges. -- **Cons:** Finer-grained progress / cancellation per element is no longer one-to-one with one pool task. 
+- **Cons:** Finer-grained progress / cancellation per element is no longer + one-to-one with one pool task. -If you relied on **one future per element**, switch to an explicit loop with `submit`, or chunk manually. +If you relied on **one future per element**, switch to an explicit loop with +`submit`, or chunk manually. ### 4.2 Scheduled pools dispatch with `post()` -`ScheduledThreadPoolT` dispatches due tasks with **`post()`** instead of **`submit()`**, so **no `std::future` is created per dispatch**. Your task bodies are unchanged; only internal overhead is lower. +`ScheduledThreadPoolT` dispatches due tasks with **`post()`** instead of +**`submit()`**, so **no `std::future` is created per dispatch**. Your task +bodies are unchanged; only internal overhead is lower. ## 5. Optional improvements after migrating These are **not** required for a successful build but match v2 design well: -| Goal | Approach | -| ---- | -------- | -| Less overhead than `submit()` | Use **`post()`** / **`try_post()`** when you do not need a return value or `std::future`. | -| Dedicated fire-and-forget pool | Use **`LightweightPool`** / **`LightweightPoolT`** (SBO task buffer, no futures). | -| Non-throwing submit | Use **`try_submit()`** / **`try_submit_batch()`** and check **`expected`**. | -| Tune fast pool polling | Use **`ThreadPoolBase>`** or keep **`FastThreadPool`** (10 ms default). | -| Tune HP deque size | **`HighPerformancePool(threads, deque_capacity)`**. | -| Fix global pool size early | **`GlobalPool<...>::init(n)`** before first **`instance()`**. | -| Workers in registry | Pass **`register_workers = true`** to pool constructors. | +| Goal | Approach | +| ------------------------------ | ----------------------------------------------------------------------------------------- | +| Less overhead than `submit()` | Use **`post()`** / **`try_post()`** when you do not need a return value or `std::future`. 
| +| Dedicated fire-and-forget pool | Use **`LightweightPool`** / **`LightweightPoolT`** (SBO task buffer, no futures). | +| Non-throwing submit | Use **`try_submit()`** / **`try_submit_batch()`** and check **`expected`**. | +| Tune fast pool polling | Use **`ThreadPoolBase>`** or keep **`FastThreadPool`** (10 ms default). | +| Tune HP deque size | **`HighPerformancePool(threads, deque_capacity)`**. | +| Fix global pool size early | **`GlobalPool<...>::init(n)`** before first **`instance()`**. | +| Workers in registry | Pass **`register_workers = true`** to pool constructors. | ## 6. Header and module notes -- New headers pulled in by the umbrella header include **`futures.hpp`** (combinators) and coroutine helpers on **`task.hpp`** as documented in [COROUTINES.md](COROUTINES.md). -- Include **`threadschedule/futures.hpp`** directly if you only need combinators. +- New headers pulled in by the umbrella header include **`futures.hpp`** + (combinators) and coroutine helpers on **`task.hpp`** as documented in + [COROUTINES.md](COROUTINES.md). +- Include **`threadschedule/futures.hpp`** directly if you only need + combinators. ## 7. Further reading diff --git a/include/threadschedule/futures.hpp b/include/threadschedule/futures.hpp index b637bfc..999fe63 100644 --- a/include/threadschedule/futures.hpp +++ b/include/threadschedule/futures.hpp @@ -2,8 +2,8 @@ /** * @file futures.hpp - * @brief Combinators for @c std::future: @ref when_all, @ref when_any, - * @ref when_all_settled. + * @brief Combinators for @c std::future: @c when_all, @c when_any, + * @c when_all_settled. * * These utilities simplify waiting on multiple futures produced by thread * pool submissions. 
diff --git a/include/threadschedule/scheduled_pool.hpp b/include/threadschedule/scheduled_pool.hpp index 748ac8e..be29f07 100644 --- a/include/threadschedule/scheduled_pool.hpp +++ b/include/threadschedule/scheduled_pool.hpp @@ -63,7 +63,7 @@ class ScheduledTaskHandle * @brief Thread pool augmented with delayed and periodic task scheduling. * * Non-copyable, non-movable. Combines a dedicated scheduler thread with - * an underlying PoolType (default: @ref ThreadPool) that does the actual work. + * an underlying PoolType (default: @c ThreadPool) that does the actual work. * * @par How task execution works * The pool owns a single scheduler thread that runs an internal loop @@ -74,9 +74,9 @@ class ScheduledTaskHandle * 1. Removes it from the multimap. * 2. Checks if the task has been cancelled (via the atomic flag). If * cancelled, the task is discarded. - * 3. Submits the task to the underlying PoolType via pool_.submit(). + * 3. Posts the task to the underlying PoolType via pool_.post(). * From this point on, the task follows the execution rules of the - * underlying pool (see @ref ThreadPool, @ref FastThreadPool, or + * underlying pool (see @c ThreadPool, @c FastThreadPool, or * @ref HighPerformancePool documentation). * 4. For periodic tasks, the scheduler immediately re-inserts the task * into the multimap with next_run += interval. This means the next @@ -97,7 +97,7 @@ class ScheduledTaskHandle * execute. The scheduler thread exits immediately on shutdown, so * future-scheduled tasks are lost. * - Cancellation is cooperative: calling handle.cancel() sets an atomic - * flag. The scheduler checks this flag before submitting the task to + * flag. The scheduler checks this flag before posting the task to * the pool. Additionally, the pool-side wrapper checks the flag again * right before calling the task. However, a task that is already * running will NOT be interrupted by cancel(). 
@@ -381,13 +381,13 @@ class ScheduledThreadPoolT } }; -/** @brief @ref ScheduledThreadPoolT using the default @ref ThreadPool backend. */ +/** @brief @ref ScheduledThreadPoolT using the default @c ThreadPool backend. */ using ScheduledThreadPool = ScheduledThreadPoolT<ThreadPool>; /** @brief @ref ScheduledThreadPoolT using @ref HighPerformancePool as backend. */ using ScheduledHighPerformancePool = ScheduledThreadPoolT<HighPerformancePool>; -/** @brief @ref ScheduledThreadPoolT using @ref FastThreadPool as backend. */ +/** @brief @ref ScheduledThreadPoolT using @c FastThreadPool as backend. */ using ScheduledFastThreadPool = ScheduledThreadPoolT<FastThreadPool>; -/** @brief @ref ScheduledThreadPoolT using @ref LightweightPool as backend (minimal overhead). */ +/** @brief @ref ScheduledThreadPoolT using @c LightweightPool as backend (minimal overhead). */ using ScheduledLightweightPool = ScheduledThreadPoolT<LightweightPool>; } // namespace threadschedule diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index a75ac3f..f510790 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -718,7 +718,7 @@ class HighPerformancePool * @brief Fire-and-forget task submission (throwing variant). * * Enqueues a callable without creating a @c std::packaged_task or - * @c std::future, giving roughly 3x higher throughput than @ref submit() + * @c std::future, giving roughly 3x higher throughput than \c submit() * for tasks whose return value is not needed. * * @throws std::runtime_error If the pool is shutting down. @@ -812,7 +812,7 @@ class HighPerformancePool * * Acquires the lock once per batch, distributing tasks across worker * queues in round-robin fashion. Significantly more efficient than - * calling @ref submit() in a loop for large batches. + * calling @c submit() in a loop for large batches. * * @tparam Iterator Forward iterator whose value_type is callable as @c void(). 
* @return @c expected containing a vector of futures, or @@ -1743,6 +1743,7 @@ class ThreadPoolBase }; /** + * @typedef ThreadPool * @brief General-purpose thread pool with indefinite blocking wait. * * Workers block on condition_variable::wait() when idle - zero CPU @@ -1754,6 +1755,7 @@ class ThreadPoolBase using ThreadPool = ThreadPoolBase; /** + * @typedef FastThreadPool * @brief Thread pool with 10 ms polling wait for lower wake-up latency. * * Workers poll with condition_variable::wait_for(10 ms), trading a small @@ -1771,7 +1773,8 @@ using FastThreadPool = ThreadPoolBase>; * @brief Ultra-lightweight fire-and-forget thread pool. * * Designed for maximum throughput on tasks whose return value is not needed. - * Typical measured throughput is **3x** higher than @ref submit() on the + * Typical measured throughput is **3x** higher than @c submit() on e.g. + * @ref HighPerformancePool on the * same hardware, because @c LightweightPoolT avoids the overhead of * @c std::packaged_task, @c std::future, and @c std::shared_ptr entirely. * @@ -1793,7 +1796,7 @@ using FastThreadPool = ThreadPoolBase>; * (no heap allocation). Larger callables fall back to the heap. * * @par What is @e not included (by design) - * - No @c std::future / @c std::packaged_task (use @ref submit() on other + * - No @c std::future / @c std::packaged_task (use @c submit() on other * pools if you need return values). * - No statistics counters (@ref HighPerformancePool::get_statistics). * - No tracing hooks (@ref HighPerformancePool::set_on_task_start). @@ -2098,6 +2101,7 @@ class LightweightPoolT }; /** + * @typedef LightweightPool * @brief Default lightweight pool with 64-byte task slots (56 bytes usable). * * Sufficient for lambdas capturing up to ~7 pointers on 64-bit platforms. @@ -2243,15 +2247,21 @@ class GlobalPool } }; -/** @brief Singleton @ref ThreadPool accessor. */ +/** + * @typedef GlobalThreadPool + * @brief Singleton accessor for the process-wide @c ThreadPool instance. 
+ */ using GlobalThreadPool = GlobalPool<ThreadPool>; -/** @brief Singleton @ref HighPerformancePool accessor. */ +/** + * @typedef GlobalHighPerformancePool + * @brief Singleton accessor for the process-wide @ref HighPerformancePool instance. + */ using GlobalHighPerformancePool = GlobalPool<HighPerformancePool>; /** * @brief Convenience wrapper that applies a callable to every element of a - * container in parallel using the @ref GlobalThreadPool singleton. + * container in parallel using the @c GlobalThreadPool singleton. * * Equivalent to: * @code diff --git a/include/threadschedule/thread_pool_with_errors.hpp b/include/threadschedule/thread_pool_with_errors.hpp index 1454544..5f74047 100644 --- a/include/threadschedule/thread_pool_with_errors.hpp +++ b/include/threadschedule/thread_pool_with_errors.hpp @@ -184,10 +184,10 @@ class PoolWithErrors /** @brief @ref HighPerformancePool with integrated error handling. */ using HighPerformancePoolWithErrors = PoolWithErrors<HighPerformancePool>; -/** @brief @ref FastThreadPool with integrated error handling. */ +/** @brief @c FastThreadPool with integrated error handling. */ using FastThreadPoolWithErrors = PoolWithErrors<FastThreadPool>; -/** @brief @ref ThreadPool with integrated error handling. */ +/** @brief @c ThreadPool with integrated error handling. */ using ThreadPoolWithErrors = PoolWithErrors<ThreadPool>; } // namespace threadschedule diff --git a/include/threadschedule/threadschedule.hpp b/include/threadschedule/threadschedule.hpp index 48c509e..0db0ac7 100644 --- a/include/threadschedule/threadschedule.hpp +++ b/include/threadschedule/threadschedule.hpp @@ -21,7 +21,7 @@ * @brief Modern C++17/20/23/26 Thread Scheduling Library * * A comprehensive header-only library for advanced thread management - * on Linux systems, providing C++ wrappers for pthreads, std::thread, + * on Linux and Windows, providing C++ wrappers for pthreads, std::thread, * and std::jthread with extended functionality. 
* * Features: From 527d2a0b912336442515d8755ead32e0b1ab883c Mon Sep 17 00:00:00 2001 From: Katze719 Date: Mon, 6 Apr 2026 23:20:41 +0200 Subject: [PATCH 12/15] Update documentation links and Doxyfile settings for improved accessibility and extraction - Changed Doxyfile to enable extraction of all documentation elements, enhancing the quality of generated documentation. - Updated links in README.md and migration guide to use HTML anchor tags for better accessibility to CHANGELOG and upgrade instructions. --- Doxyfile | 2 +- README.md | 2 +- docs/MIGRATION_V2.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Doxyfile b/Doxyfile index 117dcc0..b7998e3 100644 --- a/Doxyfile +++ b/Doxyfile @@ -21,7 +21,7 @@ FILE_PATTERNS = *.hpp *.md RECURSIVE = YES EXCLUDE_PATTERNS = */build/* */.git/* */install/* -EXTRACT_ALL = NO +EXTRACT_ALL = YES EXTRACT_PRIVATE = NO EXTRACT_STATIC = NO EXTRACT_LOCAL_CLASSES = YES diff --git a/README.md b/README.md index 08d7f8a..6365b4b 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ coroutines, `std::stop_token`). Highlights: | **Observability** | Optional auto-registration of pool workers in the thread registry; per-task `set_on_task_start` / `set_on_task_end` hooks. | | **Errors** | `ErrorHandler` callbacks get stable IDs; `remove_callback(id)` / `has_callback(id)`. | -See [CHANGELOG.md](CHANGELOG.md) for the full list, including breaking changes +See <a href="CHANGELOG.md">CHANGELOG.md</a> for the full list, including breaking changes when upgrading from v1.x. **Upgrading from v1.x:** [Migration guide (v2.0)](docs/MIGRATION_V2.md) diff --git a/docs/MIGRATION_V2.md b/docs/MIGRATION_V2.md index 5029db6..fdc6ed4 100644 --- a/docs/MIGRATION_V2.md +++ b/docs/MIGRATION_V2.md @@ -4,7 +4,7 @@ This guide helps you move from v1.x to **v2.0.0**. It lists **breaking changes** first, then **behavioral changes** you should be aware of, and finally **optional upgrades** that are not required but often worthwhile. 
-For the authoritative list of every change, see [CHANGELOG.md](../CHANGELOG.md). +For the authoritative list of every change, see <a href="../CHANGELOG.md">CHANGELOG.md</a>. ## 1. Upgrade steps @@ -162,7 +162,7 @@ These are **not** required for a successful build but match v2 design well: ## 7. Further reading - [README.md](../README.md) -- "What's new in v2.0" summary table -- [CHANGELOG.md](../CHANGELOG.md) -- full v2.0.0 notes +- <a href="../CHANGELOG.md">CHANGELOG.md</a> -- full v2.0.0 notes - [INTEGRATION.md](INTEGRATION.md) -- CMake and package managers - [ERROR_HANDLING.md](ERROR_HANDLING.md) -- pools with errors and callbacks - [SCHEDULED_TASKS.md](SCHEDULED_TASKS.md) -- scheduled pools and aliases From 6557689fb3b4e21b56a51d718e2d97c39c5489d1 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Mon, 6 Apr 2026 23:38:13 +0200 Subject: [PATCH 13/15] Enhance coroutine documentation with new features and examples - Added `schedule_on{pool}` and `run_on(pool, fn)` to the coroutine documentation, detailing their usage and behavior with thread pools. - Updated descriptions for `task`, `sync_wait`, and `generator` for consistency and clarity. - Improved overall formatting and readability of the documentation to better guide users in utilizing coroutine features. --- docs/COROUTINES.md | 81 ++++++++++++++++++++++++++++----- include/threadschedule/task.hpp | 74 ++++++++++++++++++++---------- 2 files changed, 120 insertions(+), 35 deletions(-) diff --git a/docs/COROUTINES.md b/docs/COROUTINES.md index 3a2c964..37c06c9 100644 --- a/docs/COROUTINES.md +++ b/docs/COROUTINES.md @@ -11,10 +11,14 @@ write asynchronous-looking code without building your own promise types. 
## Features -- **`task<T>`** -- Lazy single-value coroutine that starts only when `co_await`ed -- **`task<void>`** -- Void specialisation for side-effect-only coroutines -- **`sync_wait(task)`** -- Blocking bridge to run a task from synchronous code -- **`generator<T>`** -- Lazy sequence coroutine producing values via `co_yield` +- **`task<T>`** - Lazy single-value coroutine that starts only when `co_await`ed +- **`task<void>`** - Void specialisation for side-effect-only coroutines +- **`sync_wait(task)`** - Blocking bridge to run a task from synchronous code +- **`generator<T>`** - Lazy sequence coroutine producing values via `co_yield` +- **`schedule_on{pool}`** - `co_await`able hop onto a thread-pool worker (any pool + with `submit(Callable)`) +- **`run_on(pool, fn)`** - Run a callable that returns `task<T>` on the pool and get a + `std::future<T>` for the result - Automatic `std::generator` alias when C++23 `__cpp_lib_generator` is available @@ -37,7 +41,7 @@ int main() { } ``` -## `task<T>` -- Lazy Single-Value Coroutine +## `task<T>` - Lazy Single-Value Coroutine A `task<T>` represents a computation that will produce exactly one value (or throw). It is **lazy**: the coroutine body does not execute until someone @@ -131,7 +135,7 @@ task<std::unique_ptr<int>> make_ptr() { auto ptr = sync_wait(make_ptr()); // std::unique_ptr<int> ``` -## `sync_wait` -- Blocking Bridge +## `sync_wait` - Blocking Bridge `sync_wait` runs a task on the calling thread and blocks until it completes. This is the primary way to consume a `task` from non-coroutine code (e.g. @@ -149,9 +153,9 @@ int main() { > **Note:** `sync_wait` resumes the entire coroutine chain on the calling > thread. It is intended for top-level entry points. Avoid calling `sync_wait` -> from inside a coroutine -- use `co_await` instead. +> from inside a coroutine - use `co_await` instead. -## `generator<T>` -- Lazy Sequence Coroutine +## `generator<T>` - Lazy Sequence Coroutine A `generator<T>` produces a (potentially infinite) sequence of values on-demand via `co_yield`. 
It is compatible with range-based `for` loops. @@ -171,7 +175,7 @@ for (int v : iota(0, 5)) { ### Infinite sequences -Generators can represent infinite sequences -- just `break` out of the loop +Generators can represent infinite sequences - just `break` out of the loop when you're done. The generator's destructor cleans up the coroutine frame: ```cpp @@ -230,13 +234,64 @@ try { When your compiler provides `std::generator` (detected via `__cpp_lib_generator >= 202207L`), `threadschedule::generator` is -automatically aliased to `std::generator`. No code changes needed -- the +automatically aliased to `std::generator`. No code changes needed - the API is compatible. ## Combining Coroutines with Thread Pools -While the coroutine primitives are standalone, they compose naturally with the -library's thread pools: +### `schedule_on{pool}` - resume on a pool worker + +`schedule_on` is an awaitable: **`co_await schedule_on{pool}`** submits the current +coroutine frame to the pool; when a worker runs it, execution continues **on that +thread**. Any pool type works as long as it provides **`submit(Callable)`** (for +example `HighPerformancePool`, `ThreadPool`, `FastThreadPool`, or the global +singletons). + +```cpp +#include +using namespace threadschedule; + +task on_pool(HighPerformancePool& pool) { + co_await schedule_on{pool}; + // this line runs on a pool worker thread + expensive_work(); + co_return; +} + +int main() { + HighPerformancePool pool(4); + sync_wait(on_pool(pool)); +} +``` + +Step-by-step behaviour, nested `schedule_on`, and comparison with `co_await` on +another `task` are documented in Doxygen on **`schedule_on`** and **`run_on`** in +[`include/threadschedule/task.hpp`](../include/threadschedule/task.hpp) (build +with `THREADSCHEDULE_BUILD_DOCS=ON` and open the HTML API reference). 
+ +### `run_on(pool, fn)` - `task` from synchronous code via `std::future` + +**`run_on`** takes a **callable that returns `task`**, invokes it on a **pool +worker**, runs **`sync_wait`** on that task inside the worker, and returns a +**`std::future`** to the caller. Handy when the entry point is not a coroutine +but you want the body expressed as **`task`**. + +```cpp +HighPerformancePool pool(4); + +auto future = run_on(pool, []() -> task { + co_return expensive_work(); // runs on pool; co_await works inside +}); + +int result = future.get(); +``` + +The callable is executed on the pool; **`co_await`** inside the returned task +continues on that worker unless you transfer elsewhere with **`schedule_on`**. + +### Plain `submit` + `future` (no `run_on`) + +You can still bridge ordinary callables and futures without `run_on`: ```cpp #include @@ -261,6 +316,8 @@ int main() { | `task` | `task.hpp` | Lazy void coroutine | | `sync_wait(task)` | `task.hpp` | Blocking bridge, returns `T` | | `sync_wait(task)` | `task.hpp` | Blocking bridge, void overload | +| `schedule_on` | `task.hpp` | Awaitable: continue coroutine on `pool` | +| `run_on(pool, fn)` | `task.hpp` | Run `fn()` (`task`) on pool; returns `std::future` | | `generator` | `generator.hpp` | Lazy multi-value sequence coroutine | All types live in `namespace threadschedule` (alias `ts`). diff --git a/include/threadschedule/task.hpp b/include/threadschedule/task.hpp index cbeccbd..ab30d18 100644 --- a/include/threadschedule/task.hpp +++ b/include/threadschedule/task.hpp @@ -2,11 +2,11 @@ /** * @file task.hpp - * @brief Lazy single-value coroutine (`task`) and blocking bridge (`sync_wait`). + * @brief Coroutine @c task, @c sync_wait, and pool helpers @c schedule_on / @c run_on. * - * A `task` represents a lazy coroutine that produces exactly one value - * (or throws). It does not begin execution until it is `co_await`ed by - * another coroutine or passed to `sync_wait()`. 
+ * @c task is lazy until @c co_await or @c sync_wait. For how work moves onto a + * thread pool and what nested @c schedule_on does, see struct @c schedule_on and + * function template @c run_on below (C++20 only). * * Requires C++20 coroutine support. */ @@ -114,9 +114,8 @@ struct final_awaiter * - **Continuation:** `continuation_` is set by the task's awaiter just * before resuming the task. `final_awaiter` uses it to return control * to the parent coroutine. - * - **Executor:** If `executor_` is set (e.g. via `schedule_on`), the - * continuation is dispatched through the executor instead of using - * symmetric transfer. + * - **Executor:** If @c executor_ is set on the promise, the continuation is + * dispatched through that executor instead of using symmetric transfer. */ template class task_promise_base @@ -624,18 +623,44 @@ inline void sync_wait(task t) // --------------------------------------------------------------------------- /** - * @brief Awaitable that transfers execution to a thread pool. - * - * Use `co_await schedule_on{pool}` inside any coroutine to continue - * execution on one of the pool's worker threads. - * - * @tparam Pool A thread pool type providing @c submit(Callable). + * @brief Awaitable that continues the current coroutine on a thread pool worker. + * + * @tparam Pool Pool type providing @c submit(Callable) (for example + * HighPerformancePool, ThreadPool, FastThreadPool). + * + * @par Mechanism + * The coroutine stays a single @c task frame. Nothing is split into separate + * compiled "halves". @c co_await schedule_on{pool} does the following: + * -# The coroutine suspends at the @c co_await. + * -# @c await_suspend enqueues a job on @p pool that calls @c resume() on this + * coroutine handle. + * -# A worker runs that job; @c resume() continues the coroutine on that + * thread, starting with the line after the @c co_await. Everything after + * that point runs on that worker until another explicit transfer. 
+ * + * Code before the first @c co_await schedule_on{pool} runs on whatever thread + * was already running the coroutine (for example @c main under @c sync_wait, or + * a pool thread if you were already there). Code after runs on whichever worker + * picked up the @c submit. + * + * @par Nested @c schedule_on + * Each @c co_await schedule_on queues another @c submit(resume). With two + * different pools, you first continue on a worker of the first pool, then on a + * worker of the second. Nesting @c schedule_on on the same pool still uses + * @c pool.submit each time: you may run on another worker of that pool. There + * is no guarantee it is the same OS thread as before. + * + * @par Versus @c co_await on another @c task + * Awaiting another @c task usually does not post to a pool; when the child + * finishes, the parent is typically resumed on the same thread (symmetric + * transfer / direct resume). Only @c co_await schedule_on (or similar) + * explicitly pushes the continuation onto the pool queue. * * @par Example * @code * task work(HighPerformancePool& pool) { * co_await schedule_on{pool}; - * // now running on a pool thread + * // runs on a pool worker from here until the next transfer * } * @endcode */ @@ -659,15 +684,18 @@ struct schedule_on // --------------------------------------------------------------------------- /** - * @brief Submit a coroutine-returning callable to a pool and return a - * @c std::future for its result. - * - * The callable is invoked on a pool worker thread. Inside the callable, - * you can use `co_await` freely -- all continuations run on the calling - * pool unless explicitly transferred elsewhere. - * - * @tparam Pool A thread pool type providing @c submit(Callable). - * @tparam F A callable returning @c task. + * @brief Run a callable that returns @c task on a pool worker; return + * @c std::future for the result. + * + * @tparam Pool Pool type providing @c submit(Callable). 
+ * @tparam F Callable with signature returning @c task (for some @c T). + * + * @par Behaviour + * The callable is invoked on a worker thread. That worker calls @c sync_wait + * on the @c task returned by the callable, so the coroutine body runs there. + * Nested @c co_await on other @c task objects typically keeps resuming on that + * same worker unless you @c co_await schedule_on to hand off again. The + * @c std::future is fulfilled when @c sync_wait completes inside the worker. * * @par Example * @code From 5ff5f1cf26b0e86f27dcf09eb1930f9687036214 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Tue, 7 Apr 2026 21:36:30 +0200 Subject: [PATCH 14/15] Update documentation for LightweightPool integration in ScheduledTaskHandle - Added references to `LightweightPool` in the documentation for `ScheduledTaskHandle`, ensuring users are aware of all available pool types. - Improved clarity and consistency in the documentation by including `LightweightPool` in the list of underlying pool types and convenience aliases. --- include/threadschedule/scheduled_pool.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/threadschedule/scheduled_pool.hpp b/include/threadschedule/scheduled_pool.hpp index be29f07..873f647 100644 --- a/include/threadschedule/scheduled_pool.hpp +++ b/include/threadschedule/scheduled_pool.hpp @@ -76,8 +76,8 @@ class ScheduledTaskHandle * cancelled, the task is discarded. * 3. Posts the task to the underlying PoolType via pool_.post(). * From this point on, the task follows the execution rules of the - * underlying pool (see @c ThreadPool, @c FastThreadPool, or - * @ref HighPerformancePool documentation). + * underlying pool (see @c ThreadPool, @c FastThreadPool, + * @ref HighPerformancePool, or @c LightweightPool documentation). * 4. For periodic tasks, the scheduler immediately re-inserts the task * into the multimap with next_run += interval. 
This means the next * execution is timed from the scheduled time, not from when the @@ -126,7 +126,7 @@ class ScheduledTaskHandle * (default: ThreadPool). * * @see ScheduledThreadPool, ScheduledHighPerformancePool, - * ScheduledFastThreadPool (convenience aliases) + * ScheduledFastThreadPool, ScheduledLightweightPool (convenience aliases) */ template class ScheduledThreadPoolT From e3e6efd64401e1d24e6186a2c07a0e2162fd8097 Mon Sep 17 00:00:00 2001 From: Katze719 Date: Tue, 7 Apr 2026 21:44:43 +0200 Subject: [PATCH 15/15] Update documentation for error handling and task scheduling features - Revised the documentation in `ERROR_HANDLING.md` to enhance clarity and remove redundant checkmarks for features. - Improved the `SCHEDULED_TASKS.md` documentation by adding details about the new `LightweightPool` and its use cases for fire-and-forget tasks. - Updated various sections across documentation files to ensure consistent formatting and clearer descriptions, particularly regarding error handling and task scheduling functionalities. 
--- CHANGELOG.md | 12 ++--- README.md | 52 +++++++++---------- docs/ERROR_HANDLING.md | 10 ++-- docs/INTEGRATION.md | 10 ++-- docs/REGISTRY.md | 24 ++++----- docs/SCHEDULED_TASKS.md | 20 ++++--- docs/TOPOLOGY_NUMA.md | 2 +- include/threadschedule/chaos.hpp | 2 +- include/threadschedule/error_handler.hpp | 7 ++- include/threadschedule/expected.hpp | 16 +++--- include/threadschedule/futures.hpp | 2 +- include/threadschedule/pthread_wrapper.hpp | 13 +++-- include/threadschedule/registered_threads.hpp | 7 ++- include/threadschedule/scheduled_pool.hpp | 7 ++- include/threadschedule/scheduler_policy.hpp | 5 ++ include/threadschedule/task.hpp | 4 +- include/threadschedule/thread_pool.hpp | 21 +++++--- .../thread_pool_with_errors.hpp | 5 ++ include/threadschedule/thread_registry.hpp | 41 ++++++++------- include/threadschedule/thread_wrapper.hpp | 5 ++ include/threadschedule/topology.hpp | 2 +- 21 files changed, 160 insertions(+), 107 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce81069..7cc1cb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -217,7 +217,7 @@ auto futures = pool.submit_batch(tasks.begin(), tasks.end()); pointers work as callables (e.g. `JThreadWrapperReg("n", "c", &MyClass::run, this)`). - Fix: `JThreadWrapperReg` now correctly forwards `std::stop_token` to callables - that accept it, while also supporting callables without `stop_token` — the + that accept it, while also supporting callables without `stop_token` - the previous `auto&&...` wrapper always claimed to accept a token, causing a compile error when the user's callable did not. @@ -237,7 +237,7 @@ auto futures = pool.submit_batch(tasks.begin(), tasks.end()); required scheduling API. - Added: `FastThreadPool::set_affinity()` and `FastThreadPool::wait_for_tasks()` for API parity with `ThreadPool` and `HighPerformancePool`. 
-- Added: Missing forwarding methods in `WithErrors` wrappers — +- Added: Missing forwarding methods in `WithErrors` wrappers - `HighPerformancePoolWithErrors::set_affinity()`, `FastThreadPoolWithErrors::set_affinity()` and `FastThreadPoolWithErrors::wait_for_tasks()`. @@ -250,12 +250,12 @@ auto futures = pool.submit_batch(tasks.begin(), tasks.end()); all pool classes, and `ScheduledTaskHandle`. - Removed: Unused `thread_local std::random_device` in `HighPerformancePool::worker_function`. -- Added: C++20 coroutine primitive `task` (`task.hpp`) — a lazy single-value +- Added: C++20 coroutine primitive `task` (`task.hpp`) - a lazy single-value coroutine that starts execution only when `co_await`ed. Includes full `task` specialisation and exception propagation. -- Added: `sync_wait(task)` / `sync_wait(task)` — blocking bridge that +- Added: `sync_wait(task)` / `sync_wait(task)` - blocking bridge that runs a task on the calling thread and returns its result. -- Added: C++20 coroutine primitive `generator` (`generator.hpp`) — a lazy +- Added: C++20 coroutine primitive `generator` (`generator.hpp`) - a lazy multi-value coroutine producing elements via `co_yield`. Supports range-based for loops (`begin()` / `end()` with `std::default_sentinel_t`). Automatically aliases `std::generator` when C++23 `__cpp_lib_generator` is available. @@ -277,7 +277,7 @@ auto futures = pool.submit_batch(tasks.begin(), tasks.end()); - Build/Style: Update `.clang-format` (`IndentPPDirectives: AfterHash`) for clearer preprocessor indentation. -- Core: Improve `expected.hpp` header detection — check `` or +- Core: Improve `expected.hpp` header detection - check `` or `` presence before including ``. - Refactor: Simplify and clarify conditional compilation in `expected.hpp` for maintainability. diff --git a/README.md b/README.md index 6365b4b..fa5d0eb 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ or with optional **shared runtime** for multi-DSO applications. 
- **NUMA-aware Topology Helpers**: Easy affinity builders across nodes - **Chaos Testing**: RAII controller to perturb affinity/priority for validation - **C++20 Coroutines**: `task`, `generator`, and `sync_wait` out of the - box -- no boilerplate promise types needed + box - no boilerplate promise types needed - **High-Performance Pools**: Work-stealing pool, `post()` / `try_post()`, and optional `LightweightPool` for fire-and-forget workloads with minimal overhead - **Scheduled Tasks**: Run tasks at specific times, after delays, or @@ -54,12 +54,12 @@ coroutines, `std::stop_token`). Highlights: | Area | What changed | | --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Lightweight pool** | `LightweightPoolT` / `LightweightPool` -- fire-and-forget only, configurable SBO buffer (default 64 B), no futures or stats. Workers are still `ThreadWrapper` (name, affinity, policy). Ideal for maximum throughput when you do not need a return value. | -| **`post()` / `try_post()`** | On `HighPerformancePool`, `ThreadPool` / `FastThreadPool`, and `GlobalPool` -- same queue path as `submit()` but skips `packaged_task` / `future` overhead. | -| **Non-throwing submit** | `try_submit()` / `try_submit_batch()` return `expected` instead of throwing on shutdown. | +| **Lightweight pool** | `LightweightPoolT` / `LightweightPool` - fire-and-forget only, configurable SBO buffer (default 64 B), no futures or stats. Workers are still `ThreadWrapper` (name, affinity, policy). Ideal for maximum throughput when you do not need a return value. | +| **`post()` / `try_post()`** | On `HighPerformancePool`, `ThreadPool` / `FastThreadPool`, and `GlobalPool` - same queue path as `submit()` but skips `packaged_task` / `future` overhead. 
| +| **Non-throwing submit** | `try_submit()` returns `expected<std::future<T>, error_code>`; `try_submit_batch()` returns `expected<std::vector<std::future<T>>, error_code>` instead of throwing on shutdown. | | **Scheduled dispatch** | `ScheduledThreadPoolT` dispatches with `post()` internally. Alias `ScheduledLightweightPool` uses `LightweightPool` as the backend. | | **Shutdown** | `ShutdownPolicy::drain` (default) vs `drop_pending`; `shutdown_for(timeout)` for a timed drain. | -| **Parallel loops** | Chunked `parallel_for_each` on single-queue pools (same helper as the work-stealing pool). | +| **Parallel loops** | Chunked `parallel_for_each` on all pool types (shared helper across single-queue and work-stealing pools). | | **Tuning** | `PollingWait` for `FastThreadPool`, configurable work-stealing deque capacity on `HighPerformancePool`, `GlobalPool::init(n)` before first use. | | **C++20** | Ranges overloads for batch submit and `parallel_for_each`; `submit`/`try_submit` with `std::stop_token` (cooperative skip). | | **Futures** | `when_all`, `when_any`, `when_all_settled` in `futures.hpp`. 
| @@ -104,32 +104,32 @@ on: | Platform | Compiler | C++17 | C++20 | C++23 | C++26 | | ------------------- | ------------------ | :---: | :---: | :---: | :---: | | **Linux (x86_64)** | | | | | | -| Ubuntu 22.04 | GCC 11 | ✅ | ✅ | ✅ | - | -| Ubuntu 22.04 | GCC 12 | - | ✅ | - | - | -| Ubuntu 22.04 | Clang 14 | ✅ | ✅ | ✅ | - | -| Ubuntu 22.04 | Clang 15 | - | ✅ | ✅ | - | -| Ubuntu 24.04 | GCC 13 | ✅ | ✅ | ✅ | - | -| Ubuntu 24.04 | GCC 14 | ✅ | ✅ | ✅ | ✅ | -| Ubuntu 24.04 | GCC 15 | - | ✅ | ✅ | ✅ | -| Ubuntu 24.04 | Clang 16 | ✅ | ✅ | - | - | -| Ubuntu 24.04 | Clang 18 | ✅ | ✅ | - | - | -| Ubuntu 24.04 | Clang 19 | - | ✅ | ✅ | ✅ | -| Ubuntu 24.04 | Clang 21 | - | ✅ | ✅ | ✅ | +| Ubuntu 22.04 | GCC 11 | yes | yes | yes | - | +| Ubuntu 22.04 | GCC 12 | - | yes | - | - | +| Ubuntu 22.04 | Clang 14 | yes | yes | yes | - | +| Ubuntu 22.04 | Clang 15 | - | yes | yes | - | +| Ubuntu 24.04 | GCC 13 | yes | yes | yes | - | +| Ubuntu 24.04 | GCC 14 | yes | yes | yes | yes | +| Ubuntu 24.04 | GCC 15 | - | yes | yes | yes | +| Ubuntu 24.04 | Clang 16 | yes | yes | - | - | +| Ubuntu 24.04 | Clang 18 | yes | yes | - | - | +| Ubuntu 24.04 | Clang 19 | - | yes | yes | yes | +| Ubuntu 24.04 | Clang 21 | - | yes | yes | yes | | **Linux (ARM64)** | | | | | | -| Ubuntu 24.04 ARM64 | GCC 13 (system) | ✅ | ✅ | ✅ | - | -| Ubuntu 24.04 ARM64 | GCC 14 | - | ✅ | ✅ | ✅ | +| Ubuntu 24.04 ARM64 | GCC 13 (system) | yes | yes | yes | - | +| Ubuntu 24.04 ARM64 | GCC 14 | - | yes | yes | yes | | **Windows** | | | | | | -| Windows Server 2022 | MSVC 2022 | ✅ | ✅ | ✅ | - | -| Windows Server 2022 | MinGW-w64 (GCC 15) | ✅ | ✅ | ✅ | - | -| Windows Server 2025 | MSVC 2022 | ✅ | ✅ | ✅ | - | -| Windows Server 2025 | MinGW-w64 (GCC 15) | ✅ | ✅ | ✅ | - | +| Windows Server 2022 | MSVC 2022 | yes | yes | yes | - | +| Windows Server 2022 | MinGW-w64 (GCC 15) | yes | yes | yes | - | +| Windows Server 2025 | MSVC 2022 | yes | yes | yes | - | +| Windows Server 2025 | MinGW-w64 (GCC 15) | yes | yes | yes | - | 
**Additional platforms:** ThreadSchedule should work on other platforms (macOS, FreeBSD, other Linux distributions) with standard C++17+ compilers, but these are not regularly tested in CI. > **C++23**: GCC 12's libstdc++ lacks monadic `std::expected` operations -> (`and_then`, `transform`, …). Clang 16/18 on Ubuntu 24.04 use GCC 14's +> (`and_then`, `transform`, ...). Clang 16/18 on Ubuntu 24.04 use GCC 14's > libstdc++ headers which expose `std::expected` incorrectly to those Clang > versions. These combinations are therefore only tested up to C++20. > @@ -243,7 +243,7 @@ int main() { std::cout << "Frequent task!" << std::endl; }); - // v2: ScheduledLightweightPool -- same API, LightweightPool backend (post-based dispatch) + // v2: ScheduledLightweightPool - same API, LightweightPool backend (post-based dispatch) // Error handling HighPerformancePoolWithErrors pool_safe(4); @@ -398,7 +398,7 @@ auto value = pool.submit([]{ return 42; }); // standard future-based API remains ### Coroutines (C++20) -Lazy coroutine primitives -- no boilerplate promise types required. +Lazy coroutine primitives - no boilerplate promise types required. 
```cpp #include @@ -410,7 +410,7 @@ task compute(int x) { } task pipeline() { - int a = co_await compute(21); // lazy -- starts here + int a = co_await compute(21); // lazy - starts here co_return a; // 42 } diff --git a/docs/ERROR_HANDLING.md b/docs/ERROR_HANDLING.md index 9967b53..c3c12e9 100644 --- a/docs/ERROR_HANDLING.md +++ b/docs/ERROR_HANDLING.md @@ -4,11 +4,11 @@ ThreadSchedule provides comprehensive error handling for asynchronous tasks with ## Features -- ✅ **Global error callbacks** - Handle all exceptions in one place -- ✅ **Per-future error callbacks** - Handle specific task errors -- ✅ **Error context** - Get detailed information about errors (task description, thread ID, timestamp) -- ✅ **Thread-safe** - Error handlers work correctly across threads -- ✅ **Non-intrusive** - Original thread pools remain unchanged +- **Global error callbacks** - Handle all exceptions in one place +- **Per-future error callbacks** - Handle specific task errors +- **Error context** - Get detailed information about errors (task description, thread ID, timestamp) +- **Thread-safe** - Error handlers work correctly across threads +- **Non-intrusive** - Original thread pools remain unchanged ## Quick Start diff --git a/docs/INTEGRATION.md b/docs/INTEGRATION.md index 8a4b410..360f276 100644 --- a/docs/INTEGRATION.md +++ b/docs/INTEGRATION.md @@ -84,11 +84,11 @@ int main() { ``` **Why CPM?** -- ✅ Automatic caching - downloads dependencies once -- ✅ Version pinning - reproducible builds -- ✅ No git submodules needed -- ✅ Works seamlessly with CI/CD -- ✅ Compatible with all CMake features +- Automatic caching - downloads dependencies once +- Version pinning - reproducible builds +- No git submodules needed +- Works seamlessly with CI/CD +- Compatible with all CMake features ### Method 2: CMake FetchContent diff --git a/docs/REGISTRY.md b/docs/REGISTRY.md index a3aa7c9..ca2fe27 100644 --- a/docs/REGISTRY.md +++ b/docs/REGISTRY.md @@ -234,11 +234,11 @@ graph TD - **Runtime Mode**: 
Shared runtime created at startup, provides global registry instance - Core entrypoints: - - `threadschedule::registry()` – default global registry - - `threadschedule::set_external_registry(...)` – app-injected global registry - - `threadschedule::CompositeThreadRegistry` – merge multiple registries (views) - - `threadschedule::AutoRegisterCurrentThread` – RAII auto-registration - - `threadschedule::ThreadWrapperReg` – opt-in wrapper that auto-registers + - `threadschedule::registry()` - default global registry + - `threadschedule::set_external_registry(...)` - app-injected global registry + - `threadschedule::CompositeThreadRegistry` - merge multiple registries (views) + - `threadschedule::AutoRegisterCurrentThread` - RAII auto-registration + - `threadschedule::ThreadWrapperReg` - opt-in wrapper that auto-registers **Important:** The registry **requires control blocks** for all control operations (`set_affinity`, `set_priority`, `set_scheduling_policy`, `set_name`). Threads registered without control blocks can be queried but not controlled. Use `ThreadWrapperReg` or `AutoRegisterCurrentThread` to automatically create and register control blocks. @@ -316,8 +316,8 @@ target_link_libraries(your_dso PRIVATE ThreadSchedule::ThreadSchedule ThreadSche ``` - Exported APIs (same as header-only), provided by the runtime: - - `threadschedule::registry()` – returns the single process-wide registry instance - - `threadschedule::set_external_registry(ThreadRegistry*)` – optionally redirect runtime to an app-owned instance + - `threadschedule::registry()` - returns the single process-wide registry instance + - `threadschedule::set_external_registry(ThreadRegistry*)` - optionally redirect runtime to an app-owned instance Notes: - With `THREADSCHEDULE_RUNTIME=ON`, the header declares these functions and the `.so/.dll` provides the definitions. 
@@ -529,7 +529,7 @@ void foreign_thread() { } ``` -#### 5) Runtime (shared) example – app + two DSOs +#### 5) Runtime (shared) example - app + two DSOs This repository includes a minimal working example under `examples/runtime_shared/` that demonstrates using `THREADSCHEDULE_RUNTIME`: @@ -581,7 +581,7 @@ cmake -B build -DTHREADSCHEDULE_RUNTIME=ON -DTHREADSCHEDULE_BUILD_EXAMPLES=ON cmake --build build --target runtime_main ``` -Run `runtime_main` – it will list threads from both DSOs via the single shared registry. +Run `runtime_main` - it will list threads from both DSOs via the single shared registry. ### Platform notes @@ -593,9 +593,9 @@ Run `runtime_main` – it will list threads from both DSOs via the single shared ### Error handling All control functions return `expected`. Typical errors include: -- `std::errc::no_such_process` – Thread not found in registry or no control block available -- `std::errc::operation_not_permitted` – Insufficient privileges -- `std::errc::invalid_argument` – Invalid parameters +- `std::errc::no_such_process` - Thread not found in registry or no control block available +- `std::errc::operation_not_permitted` - Insufficient privileges +- `std::errc::invalid_argument` - Invalid parameters ### Duplicate registrations diff --git a/docs/SCHEDULED_TASKS.md b/docs/SCHEDULED_TASKS.md index 48b4761..05384ab 100644 --- a/docs/SCHEDULED_TASKS.md +++ b/docs/SCHEDULED_TASKS.md @@ -4,11 +4,11 @@ ThreadSchedule provides a powerful scheduling system for running tasks at specif ## Features -- ✅ **One-time scheduled tasks** - Run a task after a delay or at a specific time -- ✅ **Periodic tasks** - Run tasks repeatedly at fixed intervals -- ✅ **Cancellable tasks** - Cancel scheduled tasks before they execute -- ✅ **Flexible execution** - Choose from ThreadPool (default), HighPerformancePool, or FastThreadPool -- ✅ **Thread-safe** - Safe to use from multiple threads +- **One-time scheduled tasks** - Run a task after a delay or at a specific time +- 
**Periodic tasks** - Run tasks repeatedly at fixed intervals +- **Cancellable tasks** - Cancel scheduled tasks before they execute +- **Flexible execution** - Choose from ThreadPool (default), HighPerformancePool, FastThreadPool, or LightweightPool +- **Thread-safe** - Safe to use from multiple threads ## Quick Start @@ -33,7 +33,7 @@ int main() { ### Pool Types -ThreadSchedule provides three variants of the scheduled pool: +ThreadSchedule provides four built-in variants of the scheduled pool: ```cpp // Default: Uses ThreadPool (< 1k tasks/sec, simple and efficient) @@ -45,6 +45,9 @@ ScheduledHighPerformancePool scheduler_hp(4); // Fast: Uses FastThreadPool (1k-10k tasks/sec, single queue) ScheduledFastThreadPool scheduler_fast(4); +// Lightweight: Uses LightweightPool (fire-and-forget, no futures/stats) +ScheduledLightweightPool scheduler_lw(4); + // Custom: Use any pool type ScheduledThreadPoolT scheduler_custom(4); ``` @@ -271,6 +274,11 @@ auto backup = scheduler.schedule_periodic(std::chrono::hours(24), []() { - **Pros**: Single queue, balanced performance - **Best for**: Batch processing, moderate workloads +### LightweightPool +- **Use when**: Fire-and-forget scheduled work, no return values needed +- **Pros**: Minimal overhead, no future/packaged_task allocation per dispatch +- **Best for**: Periodic logging, telemetry, cleanup tasks + **Example:** ```cpp // For infrequent timers (default) diff --git a/docs/TOPOLOGY_NUMA.md b/docs/TOPOLOGY_NUMA.md index 4eb99c2..55c407d 100644 --- a/docs/TOPOLOGY_NUMA.md +++ b/docs/TOPOLOGY_NUMA.md @@ -28,7 +28,7 @@ ThreadWrapper t([]{ /* work */ }); ThreadPool pool(8); auto affs = distribute_affinities_by_numa(pool.size()); for (size_t i = 0; i < pool.size(); ++i) { - // In simple ThreadPool: use set_affinity returning bool + // set_affinity returns expected (void)pool.set_affinity(affs[i]); } ``` diff --git a/include/threadschedule/chaos.hpp b/include/threadschedule/chaos.hpp index 1be7538..78883b3 100644 --- 
a/include/threadschedule/chaos.hpp +++ b/include/threadschedule/chaos.hpp @@ -66,7 +66,7 @@ struct ChaosConfig * synchronized, so multiple controllers or concurrent registrations are * safe. * - * @warning Intended for testing and validation only -- not for production + * @warning Intended for testing and validation only - not for production * use. Perturbations may cause spurious priority inversions and * cache-thrashing. * diff --git a/include/threadschedule/error_handler.hpp b/include/threadschedule/error_handler.hpp index ed8b576..920957f 100644 --- a/include/threadschedule/error_handler.hpp +++ b/include/threadschedule/error_handler.hpp @@ -1,5 +1,10 @@ #pragma once +/** + * @file error_handler.hpp + * @brief Error handling primitives: TaskError, ErrorHandler, and ErrorHandledTask. + */ + #include #include #include @@ -122,7 +127,7 @@ using ErrorCallback = std::function; * * @par Callback execution * - Callbacks are invoked in the order they were registered (FIFO). - * - Callbacks run **under the lock** -- keep them short and non-blocking to + * - Callbacks run **under the lock** - keep them short and non-blocking to * avoid contention with other threads that may call handle_error() or * add_callback() concurrently. 
* - If a callback itself throws, the exception is silently swallowed so that diff --git a/include/threadschedule/expected.hpp b/include/threadschedule/expected.hpp index 551187c..bf1dfec 100644 --- a/include/threadschedule/expected.hpp +++ b/include/threadschedule/expected.hpp @@ -18,10 +18,10 @@ * @par Monadic operations * Both the primary template and the @c void specialization support the four * monadic combinators from P0323R12: - * - @c and_then -- chain an operation that returns an @c expected - * - @c or_else -- recover from an error, returning an @c expected - * - @c transform -- map the contained value - * - @c transform_error -- map the contained error + * - @c and_then - chain an operation that returns an @c expected + * - @c or_else - recover from an error, returning an @c expected + * - @c transform - map the contained value + * - @c transform_error - map the contained error */ #include @@ -205,10 +205,10 @@ class unexpected * * @par Monadic operations * The following combinators are provided (matching the C++23 specification): - * - @c and_then(f) -- if has_value(), invoke @p f with the value and return the result - * - @c or_else(f) -- if in error state, invoke @p f with the error and return the result - * - @c transform(f) -- if has_value(), apply @p f to the value and wrap the result - * - @c transform_error(f) -- if in error state, apply @p f to the error and wrap the result + * - @c and_then(f) - if has_value(), invoke @p f with the value and return the result + * - @c or_else(f) - if in error state, invoke @p f with the error and return the result + * - @c transform(f) - if has_value(), apply @p f to the value and wrap the result + * - @c transform_error(f) - if in error state, apply @p f to the error and wrap the result */ template class expected diff --git a/include/threadschedule/futures.hpp b/include/threadschedule/futures.hpp index 999fe63..19c8c9e 100644 --- a/include/threadschedule/futures.hpp +++ b/include/threadschedule/futures.hpp @@ 
-144,7 +144,7 @@ inline auto when_all_settled(std::vector>& futures) * Polls all futures round-robin with a 1 ms timeout until one is ready, * then returns its index and value. * - * @note The remaining futures are left in their current state -- the caller + * @note The remaining futures are left in their current state - the caller * is responsible for managing their lifetime. * * @tparam T The value type of each future. diff --git a/include/threadschedule/pthread_wrapper.hpp b/include/threadschedule/pthread_wrapper.hpp index db7ee4b..51816de 100644 --- a/include/threadschedule/pthread_wrapper.hpp +++ b/include/threadschedule/pthread_wrapper.hpp @@ -1,5 +1,10 @@ #pragma once +/** + * @file pthread_wrapper.hpp + * @brief RAII wrapper around POSIX threads (Linux only). + */ + #include "concepts.hpp" #include "expected.hpp" #include "scheduler_policy.hpp" @@ -26,7 +31,7 @@ namespace threadschedule /** * @brief RAII wrapper around POSIX threads with a modern C++ interface. * - * Linux-only -- not available on Windows (guarded by @c _WIN32). + * Linux-only - not available on Windows (guarded by @c _WIN32). * * Non-copyable, movable. The destructor automatically joins the thread * if it is still joinable, which **blocks** until the thread finishes. @@ -41,8 +46,8 @@ namespace threadschedule * affect the **calling** thread, not the PThreadWrapper's thread. * * @par Factory methods - * - create_with_config() -- creates a thread and applies name/policy/priority. - * - create_with_attributes() -- creates a thread from a raw @c pthread_attr_t. + * - create_with_config() - creates a thread and applies name/policy/priority. + * - create_with_attributes() - creates a thread from a raw @c pthread_attr_t. * * @see is_thread_like (specialised to @c true_type at end of file) */ @@ -410,7 +415,7 @@ class PThreadAttributes * * @note The constructor throws @c std::runtime_error if * @c pthread_mutex_init fails. 
Unusually for a mutex type, - * lock() and unlock() also throw on error -- callers should be + * lock() and unlock() also throw on error - callers should be * aware of this when mixing with code that assumes non-throwing * mutex operations. */ diff --git a/include/threadschedule/registered_threads.hpp b/include/threadschedule/registered_threads.hpp index 63a607b..88fbc0b 100644 --- a/include/threadschedule/registered_threads.hpp +++ b/include/threadschedule/registered_threads.hpp @@ -1,5 +1,10 @@ #pragma once +/** + * @file registered_threads.hpp + * @brief Thread wrappers with automatic global registry registration. + */ + #include "pthread_wrapper.hpp" #include "thread_registry.hpp" #include "thread_wrapper.hpp" @@ -50,7 +55,7 @@ class ThreadWrapperReg : public ThreadWrapper * Non-copyable, movable. C++20 only. Behaves like @ref ThreadWrapperReg * but wraps a @c std::jthread and handles @c std::stop_token * forwarding: the callable may accept a @c stop_token as its first - * argument, its last argument, or not at all -- all three signatures + * argument, its last argument, or not at all - all three signatures * are detected at compile time and dispatched accordingly. */ class JThreadWrapperReg : public JThreadWrapper diff --git a/include/threadschedule/scheduled_pool.hpp b/include/threadschedule/scheduled_pool.hpp index 873f647..f3f1863 100644 --- a/include/threadschedule/scheduled_pool.hpp +++ b/include/threadschedule/scheduled_pool.hpp @@ -1,5 +1,10 @@ #pragma once +/** + * @file scheduled_pool.hpp + * @brief Delayed and periodic task scheduling on top of any pool type. + */ + #include "expected.hpp" #include "thread_pool.hpp" #include @@ -107,7 +112,7 @@ class ScheduledTaskHandle * from when the task actually finishes. * - There is no returned std::future for scheduled tasks. If you need * to observe the result, use the underlying pool directly via - * thread_pool().submit(). + * thread_pool().post() or thread_pool().submit(). 
* * @par Thread safety * All schedule_* methods are thread-safe (protected by an internal diff --git a/include/threadschedule/scheduler_policy.hpp b/include/threadschedule/scheduler_policy.hpp index 61a9dfc..cb167d7 100644 --- a/include/threadschedule/scheduler_policy.hpp +++ b/include/threadschedule/scheduler_policy.hpp @@ -1,5 +1,10 @@ #pragma once +/** + * @file scheduler_policy.hpp + * @brief Scheduling policies, thread priority, and CPU affinity types. + */ + #include "expected.hpp" #include #include diff --git a/include/threadschedule/task.hpp b/include/threadschedule/task.hpp index ab30d18..3fd4e51 100644 --- a/include/threadschedule/task.hpp +++ b/include/threadschedule/task.hpp @@ -536,7 +536,7 @@ class sync_wait_task * its result. * * This is the primary bridge between coroutine code and synchronous code. - * The task is resumed **on the calling thread** -- no thread pool or + * The task is resumed **on the calling thread** - no thread pool or * executor is involved. * * If the task's coroutine body throws an exception, `sync_wait` @@ -587,7 +587,7 @@ auto sync_wait(task t) -> T * Overload for void tasks. Behaves identically to the `task` overload * but returns nothing. * - * The task is resumed **on the calling thread** -- no thread pool or + * The task is resumed **on the calling thread** - no thread pool or * executor is involved. If the task body throws, the exception is * re-thrown to the caller. * diff --git a/include/threadschedule/thread_pool.hpp b/include/threadschedule/thread_pool.hpp index f510790..47c5436 100644 --- a/include/threadschedule/thread_pool.hpp +++ b/include/threadschedule/thread_pool.hpp @@ -1,5 +1,10 @@ #pragma once +/** + * @file thread_pool.hpp + * @brief Thread pools: HighPerformancePool, ThreadPoolBase, LightweightPoolT, and GlobalPool. 
+ */ + #include "expected.hpp" #include "scheduler_policy.hpp" #include "thread_registry.hpp" @@ -777,8 +782,10 @@ class HighPerformancePool #if __cpp_lib_jthread >= 201911L /** - * @brief Submit a cancellable task. If stop is already requested the task - * is skipped and the future throws @c std::future_error (broken_promise). + * @brief Submit a cancellable task (C++20). + * + * If @p token is already stopped the task body is skipped and + * the future receives a default-constructed result. */ template auto submit(std::stop_token token, F&& f, Args&&... args) -> std::future> @@ -791,9 +798,7 @@ class HighPerformancePool }); } - /** - * @brief Non-throwing cancellable submission. - */ + /// @brief Non-throwing cancellable submission (C++20). template auto try_submit(std::stop_token token, F&& f, Args&&... args) -> expected>, std::error_code> @@ -973,7 +978,7 @@ class HighPerformancePool * * Each worker is named @c name_prefix + "_0", @c "_1", etc. * - * @return @c expected -- error if the OS + * @return @c expected - error if the OS * rejected any configuration call. */ auto configure_threads(std::string const& name_prefix, SchedulingPolicy policy = SchedulingPolicy::OTHER, @@ -1963,8 +1968,8 @@ class LightweightPoolT /** * @brief Shut the pool down. * - * @param policy @c drain (default) -- workers finish all queued tasks - * before exiting. @c drop_pending -- the queue is cleared + * @param policy @c drain (default) - workers finish all queued tasks + * before exiting. @c drop_pending - the queue is cleared * and only the currently executing tasks are allowed to * finish. 
* diff --git a/include/threadschedule/thread_pool_with_errors.hpp b/include/threadschedule/thread_pool_with_errors.hpp index 5f74047..756a320 100644 --- a/include/threadschedule/thread_pool_with_errors.hpp +++ b/include/threadschedule/thread_pool_with_errors.hpp @@ -1,5 +1,10 @@ #pragma once +/** + * @file thread_pool_with_errors.hpp + * @brief PoolWithErrors wrapper that combines any pool with an ErrorHandler. + */ + #include "error_handler.hpp" #include "thread_pool.hpp" #include diff --git a/include/threadschedule/thread_registry.hpp b/include/threadschedule/thread_registry.hpp index 7dd5b9e..baf5d5b 100644 --- a/include/threadschedule/thread_registry.hpp +++ b/include/threadschedule/thread_registry.hpp @@ -1,5 +1,10 @@ #pragma once +/** + * @file thread_registry.hpp + * @brief Process-wide thread registry, control blocks, and composite registry. + */ + #include "expected.hpp" #include "scheduler_policy.hpp" #include "thread_wrapper.hpp" // for ThreadInfo, ThreadAffinity @@ -61,19 +66,19 @@ using Tid = pid_t; // Linux TID via gettid() * Fully copyable and movable (regular value semantics). * * @par Lifetime - * A RegisteredThreadInfo is a *snapshot* -- it may outlive the thread it + * A RegisteredThreadInfo is a *snapshot* - it may outlive the thread it * describes. The @c alive flag reflects the state at the time the snapshot * was taken; it is **not** updated retroactively when the thread unregisters. * * @par Fields - * - @c tid -- OS-level thread identifier (@c pid_t on Linux via + * - @c tid - OS-level thread identifier (@c pid_t on Linux via * @c gettid(), @c DWORD on Windows). - * - @c stdId -- The corresponding @c std::thread::id. - * - @c name -- Human-readable name given at registration time. - * - @c componentTag -- Optional logical grouping tag (e.g. "io", "compute"). - * - @c alive -- @c true while the thread is registered; set to @c false when + * - @c stdId - The corresponding @c std::thread::id. 
+ * - @c name - Human-readable name given at registration time. + * - @c componentTag - Optional logical grouping tag (e.g. "io", "compute"). + * - @c alive - @c true while the thread is registered; set to @c false when * the thread calls @c unregister_current_thread(). - * - @c control -- Shared pointer to the thread's @ref ThreadControlBlock. May be + * - @c control - Shared pointer to the thread's @ref ThreadControlBlock. May be * @c nullptr if the thread was registered without a control * block (i.e. via the name-only overload of * @c register_current_thread()). @@ -108,7 +113,7 @@ struct RegisteredThreadInfo * * @par Thread safety * - The object is **not** copyable and **not** movable (identity type). - * - All @c set_* methods are safe to call from **any** thread -- they operate + * - All @c set_* methods are safe to call from **any** thread - they operate * on the stored native handle, not on thread-local state. * - Concurrent calls to different @c set_* methods on the same instance are * safe (each call is a single OS syscall on the stored handle). @@ -404,7 +409,7 @@ class ThreadRegistry : public detail::QueryFacadeMixin * * A QueryView is produced by ThreadRegistry::query() (or by chaining * operations on an existing QueryView). It holds an internal - * @c std::vector that is a **snapshot** -- mutations + * @c std::vector that is a **snapshot** - mutations * to the originating ThreadRegistry after the QueryView was created are * not visible. * @@ -421,18 +426,18 @@ class ThreadRegistry : public detail::QueryFacadeMixin * * @par API * Provides a functional-style interface: - * - **filter(pred)** -- returns a new QueryView containing only entries + * - **filter(pred)** - returns a new QueryView containing only entries * that satisfy @p pred. - * - **map(fn)** -- transforms each entry and returns a + * - **map(fn)** - transforms each entry and returns a * @c std::vector. - * - **for_each(fn)** -- applies @p fn to every entry. 
- * - **find_if(pred)** -- returns the first matching entry, or + * - **for_each(fn)** - applies @p fn to every entry. + * - **find_if(pred)** - returns the first matching entry, or * @c std::nullopt. - * - **any / all / none(pred)** -- boolean aggregation predicates. - * - **take(n) / skip(n)** -- positional slicing, returning new + * - **any / all / none(pred)** - boolean aggregation predicates. + * - **take(n) / skip(n)** - positional slicing, returning new * QueryViews. - * - **count() / empty()** -- size queries. - * - **entries()** -- direct access to the underlying vector. + * - **count() / empty()** - size queries. + * - **entries()** - direct access to the underlying vector. */ class QueryView { @@ -838,7 +843,7 @@ class CompositeThreadRegistry : public detail::QueryFacadeMixin diff --git a/include/threadschedule/topology.hpp b/include/threadschedule/topology.hpp index 76dfc90..3f78d36 100644 --- a/include/threadschedule/topology.hpp +++ b/include/threadschedule/topology.hpp @@ -47,7 +47,7 @@ struct CpuTopology * Windows: single node, sequential CPU indices. * * Called frequently by chaos/affinity helpers. The result is not - * cached internally -- consider caching the returned CpuTopology + * cached internally - consider caching the returned CpuTopology * yourself if performance of repeated calls matters. */ inline auto read_topology() -> CpuTopology