NVIDIA · mzient · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
diff --git a/dali/benchmark/displacement_cpu_bench.cc b/dali/benchmark/displacement_cpu_bench.cc
@@ -94,7 +94,7 @@ void DisplacementBench(benchmark::State& st) {//NOLINT
   }
 
   // We need a thread pool
-  ThreadPool tp(4, 0, false, "DisplacementBench");
+  OldThreadPool tp(4, 0, false, "DisplacementBench");
 
   // Create workspace and set input and output
   Workspace ws;

diff --git a/dali/benchmark/operator_bench.h b/dali/benchmark/operator_bench.h
@@ -68,7 +68,7 @@ class OperatorBench : public DALIBenchmark {
     // Create workspace and set input and output
     Workspace ws;
     ws.AddInput(data_in);
-    ThreadPool tp(num_threads, 0, false, "OperatorBench");
+    OldThreadPool tp(num_threads, 0, false, "OperatorBench");
     ws.SetThreadPool(&tp);
 
     Setup<TensorList<CPUBackend>>(op_ptr, op_spec, ws, batch_size);

diff --git a/dali/benchmark/thread_pool_bench.cc b/dali/benchmark/thread_pool_bench.cc
@@ -33,7 +33,7 @@ BENCHMARK_DEFINE_F(ThreadPoolBench, AddWorkDeferred)(benchmark::State& st) {
   int work_size_max = st.range(2);
   int nthreads = st.range(3);
 
-  ThreadPool thread_pool(nthreads, 0, false, "ThreadPoolBench");
+  OldThreadPool thread_pool(nthreads, 0, false, "ThreadPoolBench");
   std::vector<uint8_t> data(2000, 0xFF);
 
   std::atomic<int64_t> total_count(0);

diff --git a/dali/kernels/test/scatter_gather_test.cc b/dali/kernels/test/scatter_gather_test.cc
@@ -142,7 +142,7 @@ class ScatterGatherTest : public testing::Test {
     this->template Memset<kind>(out_ptr.get(), 0, out.size());
 
     T sg(max_block);
-    ThreadPool tp(4, 0, false, "test TP");
+    OldThreadPool tp(4, 0, false, "test TP");
     // copy
     for (auto &r : ranges)
       sg.AddCopy(r.dst, r.src, r.size);

diff --git a/dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h b/dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h
@@ -1150,8 +1150,8 @@ class nvJPEGDecoder : public StatelessOperator<MixedBackend>, CachedDecoderImpl
   nvjpegDevAllocator_t device_allocator_;
   nvjpegPinnedAllocator_t pinned_allocator_;
 
-  ThreadPool thread_pool_;
-  ThreadPool nvjpeg2k_thread_;
+  OldThreadPool thread_pool_;
+  OldThreadPool nvjpeg2k_thread_;
   static constexpr int kOutputDim = 3;
 
   TensorList<CPUBackend> hw_decoder_images_staging_;

diff --git a/dali/operators/imgcodec/image_decoder.h b/dali/operators/imgcodec/image_decoder.h
@@ -229,7 +229,7 @@ class ImageDecoder : public StatelessOperator<Backend> {
     GetDecoderSpecificArguments(spec);
 
     if (std::is_same<MixedBackend, Backend>::value) {
-      thread_pool_ = std::make_unique<ThreadPool>(num_threads_, device_id_,
+      thread_pool_ = std::make_unique<OldThreadPool>(num_threads_, device_id_,
                                                   spec.GetArgument<bool>("affine"), "MixedDecoder");
       if (spec_.HasArgument("cache_size"))
         cache_ = std::make_unique<CachedDecoderImpl>(spec_);

diff --git a/dali/operators/reader/fits_reader_op.cc b/dali/operators/reader/fits_reader_op.cc
@@ -43,7 +43,7 @@ This operator can be used in the following modes:
 2. Read file names from a text file indicated in `file_list` argument.
 3. Read files listed in `files` argument.
 4. Number of outputs per sample corresponds to the length of `hdu_indices` argument. By default,
-first HDU with data is read from each file, so the number of outputs defaults to 1. 
+first HDU with data is read from each file, so the number of outputs defaults to 1.
 )")
     .NumInput(0)
     .OutputFn(detail::FitsReaderOutputFn)
@@ -94,7 +94,7 @@ If `file_root` is provided, the paths are treated as being relative to it.
 This argument is mutually exclusive with `file_list`.)",
                                     nullptr)
     .AddOptionalArg("hdu_indices",
-                    R"(HDU indices to read. If not provided, the first HDU after the primary 
+                    R"(HDU indices to read. If not provided, the first HDU after the primary
 will be yielded. Since HDUs are indexed starting from 1, the default value is as follows: hdu_indices = [2].
 Size of the provided list hdu_indices defines number of outputs per sample.)",
                     std::vector<int>{2})
@@ -114,7 +114,7 @@ void FitsReaderCPU::RunImpl(Workspace &ws) {
     auto &output = ws.Output<CPUBackend>(output_idx);
     for (int file_idx = 0; file_idx < num_samples; file_idx++) {
       auto &sample = GetSample(file_idx);
-      ThreadPool::Work copy_task = [output_idx = output_idx, data_idx = file_idx, &output,
+      auto copy_task = [output_idx = output_idx, data_idx = file_idx, &output,
                                     &sample](int) {
         std::memcpy(output.raw_mutable_tensor(data_idx), sample.data[output_idx].raw_data(),
                     sample.data[output_idx].nbytes());

diff --git a/dali/operators/reader/nemo_asr_reader_op.h b/dali/operators/reader/nemo_asr_reader_op.h
@@ -46,7 +46,7 @@ class NemoAsrReader : public DataReader<CPUBackend, AsrSample, AsrSample, true>
   DALIDataType dtype_;
 
   int num_threads_;
-  ThreadPool thread_pool_;
+  OldThreadPool thread_pool_;
 
   // prefetch_depth * batch_size set of buffers that we reuse to decode audio
   using TensorListPtr = std::unique_ptr<TensorList<CPUBackend>>;

diff --git a/dali/operators/reader/numpy_reader_gpu_op.h b/dali/operators/reader/numpy_reader_gpu_op.h
@@ -55,7 +55,7 @@ class NumpyReaderGPU : gds::GDSLazyInit, public NumpyReader<GPUBackend, NumpyFil
  protected:
   // we need to do the threading manually because gpu workspaces
   // do not have a thread pool
-  ThreadPool thread_pool_;
+  OldThreadPool thread_pool_;
 
   vector<TensorList<GPUBackend>> prefetched_batch_tensors_;
 

diff --git a/dali/operators/reader/numpy_reader_op.h b/dali/operators/reader/numpy_reader_op.h
@@ -196,8 +196,8 @@ class NumpyReaderCPU : public NumpyReader<CPUBackend, NumpyFileWrapper> {
    */
   size_t o_direct_alignm_ = 0;
   size_t o_direct_read_len_alignm_ = 0;
-  // ThreadPool for prefetch which is a separate thread
-  ThreadPool thread_pool_;
+  // Thread Pool for prefetch which is a separate thread
+  OldThreadPool thread_pool_;
 };
 
 }  // namespace dali

diff --git a/dali/operators/reader/tfrecord_reader_op.h b/dali/operators/reader/tfrecord_reader_op.h
@@ -79,8 +79,8 @@ class TFRecordReader
   bool dont_use_mmap_ = false;
   bool use_o_direct_ = false;
   size_t o_direct_chunk_size_ = 0;
-  // ThreadPool for prefetch which is a separate thread
-  ThreadPool thread_pool_;
+  // Thread Pool for prefetch which is a separate thread
+  OldThreadPool thread_pool_;
 };
 
 }  // namespace dali

diff --git a/dali/operators/reader/webdataset_reader_op.cc b/dali/operators/reader/webdataset_reader_op.cc
@@ -50,7 +50,7 @@ void WebdatasetReader::RunImpl(Workspace &ws) {
     auto& output = ws.Output<CPUBackend>(output_idx);
     for (int data_idx = 0; data_idx < num_samples; data_idx++) {
       auto& sample = GetSample(data_idx);
-      ThreadPool::Work copy_task = [output_idx = output_idx, data_idx = data_idx, &output,
+      auto copy_task = [output_idx = output_idx, data_idx = data_idx, &output,
                                     &sample](int) {
         output.SetMeta(data_idx, sample[output_idx].GetMeta());
         std::memcpy(output.raw_mutable_tensor(data_idx), sample[output_idx].raw_data(),

diff --git a/dali/operators/video/decoder/video_decoder_base.h b/dali/operators/video/decoder/video_decoder_base.h
@@ -48,7 +48,7 @@ class DLL_PUBLIC VideoDecoderBase : public Operator<Backend> {
     if (spec_.HasArgument("device")) {
       auto device_str = spec_.template GetArgument<std::string>("device");
       if (device_str == "mixed") {
-        thread_pool_ = std::make_unique<ThreadPool>(
+        thread_pool_ = std::make_unique<OldThreadPool>(
             spec.GetArgument<int>("num_threads"), spec.GetArgument<int>("device_id"),
             spec.GetArgument<bool>("affine"), "VideoDecoder");
       }

diff --git a/dali/operators/video/input/video_input.h b/dali/operators/video/input/video_input.h
@@ -99,8 +99,11 @@ class VideoInput : public InputOperator<Backend> {
         last_sequence_policy_ == "partial" || last_sequence_policy_ == "pad",
         make_string("Provided `last_sequence_policy` is not supported: ", last_sequence_policy_));
     if constexpr (!is_cpu) {
-      thread_pool_.emplace(this->num_threads_, spec.GetArgument<int>("device_id"),
-                           spec.GetArgument<bool>("affine"), "VideoInput<MixedBackend>");
+      thread_pool_ = std::make_unique<OldThreadPool>(
+          this->num_threads_,
+          spec.GetArgument<int>("device_id"),
+          spec.GetArgument<bool>("affine"),
+          "VideoInput<MixedBackend>");
     }
   }
 
@@ -208,7 +211,7 @@ class VideoInput : public InputOperator<Backend> {
     if constexpr (is_cpu) {
       return ws.GetThreadPool();
     } else {
-      assert(thread_pool_.has_value());
+      assert(thread_pool_);
       return *thread_pool_;
     }
   }
@@ -243,7 +246,7 @@ class VideoInput : public InputOperator<Backend> {
   uint8_t pad_frame_value_ = 0;
 
   /// CPU operators have default Thread Pool inside Workspace. Mixed and GPU ops don't.
-  std::optional<ThreadPool> thread_pool_ = std::nullopt;
+  std::unique_ptr<ThreadPool> thread_pool_;
 
   std::vector<std::unique_ptr<FramesDecoderImpl>> frames_decoders_;
 

diff --git a/dali/pipeline/executor/executor2/exec2.cc b/dali/pipeline/executor/executor2/exec2.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include "dali/pipeline/executor/executor2/exec_graph.h"
 #include "dali/pipeline/executor/executor2/stream_assignment.h"
 #include "dali/pipeline/operator/builtin/input_operator.h"
+#include "dali/pipeline/util/new_thread_pool.h"
 
 namespace dali {
 namespace exec2 {
@@ -42,9 +43,9 @@ void LimitBackendConcurrency(ExecGraph &graph, OpType backend, int max_concurren
 void ApplyConcurrencyLimit(ExecGraph &graph, OperatorConcurrency concurrency) {
   switch (concurrency) {
     case OperatorConcurrency::Full:
-      // TODO(michalz): Fix ThreadPool.
-      LimitBackendConcurrency(graph, OpType::CPU);
-      break;  // other operators have no restrictions
+      if (!UseNewThreadPool())  // old thread pool is not thread safe
+        LimitBackendConcurrency(graph, OpType::CPU);
+      break;
     case OperatorConcurrency::Backend:
       LimitBackendConcurrency(graph, OpType::CPU);
       LimitBackendConcurrency(graph, OpType::GPU);
@@ -345,18 +346,37 @@ class Executor2::Impl {
   }
 
   void SetupThreadPool() {
-    if (graph_info_.num_cpu > 0) {
-      tp_ = std::make_unique<ThreadPool>(
-        config_.thread_pool_threads,
-        config_.device.value_or(CPU_ONLY_DEVICE_ID),
-        config_.set_affinity,
-        "Executorv_v2");
+    thread_pool_wrappers_.clear();
+    new_tp_.reset();
+    old_tp_.reset();
+
+    if (UseNewThreadPool()) {
+      std::cerr << "\n!!! Forced use of NewThreadPool !!!" << std::endl;
+      if (graph_info_.num_cpu > 0) {
+        new_tp_ = std::make_unique<NewThreadPool>(
+          config_.thread_pool_threads,
+          config_.device.value_or(CPU_ONLY_DEVICE_ID),
+          config_.set_affinity,
+          "Executor_v2");
+      }
+      for (auto &n : graph_.Nodes()) {
+        if (n.backend == OpType::CPU) {
+          thread_pool_wrappers_.push_back(std::make_unique<ThreadPoolFacade>(new_tp_.get()));
+          n.env.thread_pool = thread_pool_wrappers_.back().get();
+        }
+      }
     } else {
-      tp_.reset();
-    }
-    for (auto &n : graph_.Nodes()) {
-      if (n.backend == OpType::CPU)
-        n.env.thread_pool = tp_.get();
+      if (graph_info_.num_cpu > 0) {
+        old_tp_ = std::make_unique<OldThreadPool>(
+          config_.thread_pool_threads,
+          config_.device.value_or(CPU_ONLY_DEVICE_ID),
+          config_.set_affinity,
+          "Executor_v2");
+      }
+      for (auto &n : graph_.Nodes()) {
+        if (n.backend == OpType::CPU)
+          n.env.thread_pool = old_tp_.get();
+      }
     }
   }
 
@@ -421,7 +441,9 @@ class Executor2::Impl {
 
   // Runtime environment
 
-  std::unique_ptr<ThreadPool> tp_;
+  std::unique_ptr<OldThreadPool> old_tp_;
+  std::unique_ptr<NewThreadPool> new_tp_;
+  std::vector<std::unique_ptr<ThreadPool>> thread_pool_wrappers_;
   std::queue<tasking::TaskFuture> pending_outputs_;
   std::vector<CUDAStreamLease> streams_;
   std::map<std::string, ExecNode *, std::less<>> node_map_;

diff --git a/dali/pipeline/executor/executor2/exec_graph_test.cc b/dali/pipeline/executor/executor2/exec_graph_test.cc
@@ -84,7 +84,7 @@ TEST(ExecGraphTest, SimpleGraph) {
   LimitBackendConcurrency(g, OpType::CPU);
 
   WorkspaceParams params = {};
-  auto tp = std::make_unique<ThreadPool>(std::thread::hardware_concurrency(), 0, false, "test");
+  auto tp = std::make_unique<OldThreadPool>(std::thread::hardware_concurrency(), 0, false, "test");
   ExecEnv env;
   env.thread_pool = tp.get();
   params.env = &env;
@@ -144,7 +144,7 @@ TEST(ExecGraphTest, SimpleGraphRepeat) {
   g.Link(n1, 0, n2, 1);
   g.Link(n2, 0, no, 0);
   LimitBackendConcurrency(g, OpType::CPU);
-  ThreadPool tp(4, 0, false, "test");
+  OldThreadPool tp(4, 0, false, "test");
   WorkspaceParams params = {};
   ExecEnv env;
   env.thread_pool = &tp;
@@ -212,7 +212,7 @@ TEST(ExecGraphTest, SimpleGraphScheduleAheadCPU) {
   g.Link(n2, 0, no, 0);
   LimitBackendConcurrency(g, OpType::CPU);
 
-  ThreadPool tp(4, 0, false, "test");
+  OldThreadPool tp(4, 0, false, "test");
   WorkspaceParams params = {};
   ExecEnv env;
   env.thread_pool = &tp;
@@ -306,7 +306,7 @@ TEST(ExecGraphTest, GraphScheduleAheadGPU) {
   n2->env.order = s2;
   no->env.order = s3;
 
-  ThreadPool tp(4, 0, false, "test");
+  OldThreadPool tp(4, 0, false, "test");
 
   n1->env.thread_pool = &tp;
 
@@ -379,7 +379,7 @@ TEST(ExecGraphTest, Exception) {
   g.Link(n1, 0, n2, 1);
   g.Link(n2, 0, no, 0);
   LimitBackendConcurrency(g, OpType::CPU);
-  ThreadPool tp(std::thread::hardware_concurrency(), 0, false, "test");
+  OldThreadPool tp(std::thread::hardware_concurrency(), 0, false, "test");
   WorkspaceParams params = {};
   ExecEnv env;
   env.thread_pool = &tp;
@@ -460,7 +460,7 @@ TEST(ExecGraphTest, LoweredExec) {
   g.Lower(def);
   LimitBackendConcurrency(g, OpType::CPU);
 
-  ThreadPool tp(std::thread::hardware_concurrency(), 0, false, "test");
+  OldThreadPool tp(std::thread::hardware_concurrency(), 0, false, "test");
   WorkspaceParams params = {};
   ExecEnv env;
   env.thread_pool = &tp;

diff --git a/dali/pipeline/executor/executor_impl.h b/dali/pipeline/executor/executor_impl.h
@@ -318,7 +318,7 @@ class DLL_PUBLIC Executor : public ExecutorBase, public QueuePolicy {
 
   OpGraph *graph_ = nullptr;
   EventPool event_pool_;
-  ThreadPool thread_pool_;
+  OldThreadPool thread_pool_;
   std::vector<ErrorInfo> errors_;
   mutable std::mutex errors_mutex_;
   bool exec_error_;

diff --git a/dali/pipeline/operator/eager_operator.h b/dali/pipeline/operator/eager_operator.h
@@ -140,8 +140,8 @@ class DLL_PUBLIC EagerOperator {
   DLL_PUBLIC inline static void UpdateThreadPool(int num_threads) {
     std::lock_guard lock(shared_thread_pool_mutex_);
 
-    SharedThreadPoolInstance().reset(
-        new ThreadPool(num_threads, CPU_ONLY_DEVICE_ID, false, "EagerOperator"));
+    SharedThreadPoolInstance() = std::make_unique<OldThreadPool>(
+        num_threads, CPU_ONLY_DEVICE_ID, false, "EagerOperator");
   }
 
   // Update shared CUDA stream used for all direct operators.
@@ -170,7 +170,7 @@ class DLL_PUBLIC EagerOperator {
   }
 
   static inline std::shared_ptr<ThreadPool> &SharedThreadPoolInstance() {
-    static std::shared_ptr<ThreadPool> thread_pool = std::make_shared<ThreadPool>(
+    static std::shared_ptr<ThreadPool> thread_pool = std::make_shared<OldThreadPool>(
         GetDefaultNumThreads(), CPU_ONLY_DEVICE_ID, false, "EagerOperator");
 
     return thread_pool;

diff --git a/dali/pipeline/operator/false_gpu_operator.h b/dali/pipeline/operator/false_gpu_operator.h
@@ -116,7 +116,7 @@ class FalseGPUOperator : public Operator<GPUBackend> {
 
  private:
   CPUOperator cpu_impl_;
-  ThreadPool thread_pool_;
+  OldThreadPool thread_pool_;
   Workspace cpu_ws_;
 
   // Keep it here so that we can modify (ws gives only const ref to inputs)

diff --git a/dali/pipeline/pipeline_debug.h b/dali/pipeline/pipeline_debug.h
@@ -95,7 +95,7 @@ class DLL_PUBLIC PipelineDebug {
   int device_id_;
   int num_threads_;
   CUDAStreamLease cuda_stream_;
-  ThreadPool thread_pool_;
+  OldThreadPool thread_pool_;
   std::unordered_map<int, EagerOperator<CPUBackend>> cpu_operators_;
   std::unordered_map<int, EagerOperator<GPUBackend>> gpu_operators_;
   std::unordered_map<int, EagerOperator<MixedBackend>> mixed_operators_;