diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp
index 64b1bb4bb1..a4f52cb4ed 100644
--- a/benchmarks/linear_programming/cuopt/run_mip.cpp
+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp
@@ -36,6 +36,7 @@
 #include <rmm/mr/device/owning_wrapper.hpp>
 
 #include <fcntl.h>
+#include <omp.h>
 #include <sys/file.h>
 #include <sys/wait.h>
 #include <unistd.h>
@@ -382,6 +383,8 @@ int main(int argc, char* argv[])
   double memory_limit    = program.get<double>("--memory-limit");
   bool track_allocations = program.get<std::string>("--track-allocations")[0] == 't';
 
+  if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; }
+
   if (program.is_used("--out-dir")) {
     out_dir     = program.get<std::string>("--out-dir");
     result_file = out_dir + "/final_result.csv";
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index 1a7ea7d825..5870a1b46a 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -15,9 +15,9 @@
  * limitations under the License.
  */
 
-#include <cmath>
+#include <omp.h>
+#include <algorithm>
 #include <dual_simplex/branch_and_bound.hpp>
-
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/logger.hpp>
 #include <dual_simplex/mip_node.hpp>
@@ -29,10 +29,12 @@
 #include <dual_simplex/tic_toc.hpp>
 #include <dual_simplex/user_problem.hpp>
 
+#include <cmath>
 #include <cstdio>
 #include <cstdlib>
-#include <future>
+#include <deque>
 #include <limits>
+#include <optional>
 #include <string>
 #include <vector>
 
@@ -133,37 +135,6 @@ void set_uninitialized_steepest_edge_norms(std::vector<f_t>& edge_norms)
   }
 }
 
-constexpr int write_graphviz = false;
-
-template <typename i_t, typename f_t>
-void graphviz_node(const simplex_solver_settings_t<i_t, f_t>& settings,
-                   mip_node_t<i_t, f_t>* node_ptr,
-                   std::string label,
-                   f_t val)
-{
-  if (write_graphviz) {
-    settings.log.printf("Node%d [label=\"%s %.16e\"]\n", node_ptr->node_id, label.c_str(), val);
-  }
-}
-
-template <typename i_t, typename f_t>
-void graphviz_edge(const simplex_solver_settings_t<i_t, f_t>& settings,
-                   mip_node_t<i_t, f_t>* origin_ptr,
-                   mip_node_t<i_t, f_t>* dest_ptr,
-                   i_t branch_var,
-                   i_t branch_dir,
-                   f_t bound)
-{
-  if (write_graphviz) {
-    settings.log.printf("Node%d -> Node%d [label=\"x%d %s %e\"]\n",
-                        origin_ptr->node_id,
-                        dest_ptr->node_id,
-                        branch_var,
-                        branch_dir == 0 ? "<=" : ">=",
-                        bound);
-  }
-}
-
 dual::status_t convert_lp_status_to_dual_status(lp_status_t status)
 {
   if (status == lp_status_t::OPTIMAL) {
@@ -199,7 +170,18 @@ f_t relative_gap(f_t obj_value, f_t lower_bound)
   f_t user_mip_gap = obj_value == 0.0
                        ? (lower_bound == 0.0 ? 0.0 : std::numeric_limits<f_t>::infinity())
                        : std::abs(obj_value - lower_bound) / std::abs(obj_value);
-  // Handle NaNs (i.e., NaN != NaN)
+  if (std::isnan(user_mip_gap)) { return std::numeric_limits<f_t>::infinity(); }
+  return user_mip_gap;
+}
+
+template <typename i_t, typename f_t>
+f_t user_relative_gap(const lp_problem_t<i_t, f_t>& lp, f_t obj_value, f_t lower_bound)
+{
+  f_t user_obj         = compute_user_objective(lp, obj_value);
+  f_t user_lower_bound = compute_user_objective(lp, lower_bound);
+  f_t user_mip_gap     = user_obj == 0.0
+                           ? (user_lower_bound == 0.0 ? 0.0 : std::numeric_limits<f_t>::infinity())
+                           : std::abs(user_obj - user_lower_bound) / std::abs(user_obj);
   if (std::isnan(user_mip_gap)) { return std::numeric_limits<f_t>::infinity(); }
   return user_mip_gap;
 }
@@ -229,7 +211,8 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
     original_lp_(1, 1, 1),
     incumbent_(1),
     root_relax_soln_(1, 1),
-    pc_(1)
+    pc_(1),
+    status_(mip_exploration_status_t::UNSET)
 {
   stats_.start_time = tic();
   convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_);
@@ -238,14 +221,6 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   mutex_upper_.lock();
   upper_bound_ = inf;
   mutex_upper_.unlock();
-
-  mutex_lower_.lock();
-  lower_bound_ = -inf;
-  mutex_lower_.unlock();
-
-  mutex_branching_.lock();
-  currently_branching_ = false;
-  mutex_branching_.unlock();
 }
 
 template <typename i_t, typename f_t>
@@ -260,12 +235,27 @@ f_t branch_and_bound_t<i_t, f_t>::get_upper_bound()
 template <typename i_t, typename f_t>
 f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
 {
-  mutex_lower_.lock();
-  const f_t lower_bound = lower_bound_;
-  mutex_lower_.unlock();
+  f_t lower_bound = lower_bound_ceiling_.load();
+  mutex_heap_.lock();
+  if (heap_.size() > 0) { lower_bound = std::min(heap_.top()->lower_bound, lower_bound); }
+  mutex_heap_.unlock();
+
+  for (i_t i = 0; i < local_lower_bounds_.size(); ++i) {
+    lower_bound = std::min(local_lower_bounds_[i].load(), lower_bound);
+  }
+
   return lower_bound;
 }
 
+template <typename i_t, typename f_t>
+i_t branch_and_bound_t<i_t, f_t>::get_heap_size()
+{
+  mutex_heap_.lock();
+  i_t size = heap_.size();
+  mutex_heap_.unlock();
+  return size;
+}
+
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solution)
 {
@@ -304,16 +294,13 @@ void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solu
   mutex_upper_.unlock();
 
   if (is_feasible) {
-    mutex_branching_.lock();
-    bool currently_branching = currently_branching_;
-    mutex_branching_.unlock();
-    if (currently_branching) {
+    if (status_ == mip_exploration_status_t::RUNNING) {
       f_t user_obj    = compute_user_objective(original_lp_, obj);
       f_t user_lower  = compute_user_objective(original_lp_, get_lower_bound());
       std::string gap = user_mip_gap<f_t>(user_obj, user_lower);
 
       settings_.log.printf(
-        "H                        %+13.6e  %+10.6e                      %s %9.2f\n",
+        "H                           %+13.6e    %+10.6e                        %s %9.2f\n",
         user_obj,
         user_lower,
         gap.c_str(),
@@ -424,6 +411,7 @@ void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
           f_t obj              = compute_user_objective(original_lp_, repaired_obj);
           f_t lower            = compute_user_objective(original_lp_, get_lower_bound());
           std::string user_gap = user_mip_gap<f_t>(obj, lower);
+
           settings_.log.printf(
             "H                        %+13.6e  %+10.6e                      %s %9.2f\n",
             obj,
@@ -445,58 +433,83 @@ void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::branch(mip_node_t<i_t, f_t>* parent_node,
-                                          i_t branch_var,
-                                          f_t branch_var_val,
-                                          const std::vector<variable_status_t>& parent_vstatus)
+mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>& solution,
+                                                              f_t lower_bound)
 {
-  // down child
-  auto down_child = std::make_unique<mip_node_t<i_t, f_t>>(
-    original_lp_, parent_node, ++stats_.num_nodes, branch_var, 0, branch_var_val, parent_vstatus);
+  mip_status_t mip_status = mip_status_t::UNSET;
 
-  graphviz_edge(
-    settings_, parent_node, down_child.get(), branch_var, 0, std::floor(branch_var_val));
+  if (status_ == mip_exploration_status_t::NUMERICAL) {
+    settings_.log.printf("Numerical issue encountered. Stopping the solver...\n");
+    mip_status = mip_status_t::NUMERICAL;
+  }
 
-  // up child
-  auto up_child = std::make_unique<mip_node_t<i_t, f_t>>(
-    original_lp_, parent_node, ++stats_.num_nodes, branch_var, 1, branch_var_val, parent_vstatus);
+  if (status_ == mip_exploration_status_t::TIME_LIMIT) {
+    settings_.log.printf("Time limit reached. Stopping the solver...\n");
+    mip_status = mip_status_t::TIME_LIMIT;
+  }
 
-  graphviz_edge(settings_, parent_node, up_child.get(), branch_var, 1, std::ceil(branch_var_val));
+  f_t upper_bound = get_upper_bound();
+  f_t gap         = upper_bound - lower_bound;
+  f_t obj         = compute_user_objective(original_lp_, upper_bound);
+  f_t user_lower  = compute_user_objective(original_lp_, lower_bound);
+  f_t gap_rel     = user_relative_gap(original_lp_, upper_bound, lower_bound);
 
-  assert(parent_vstatus.size() == original_lp_.num_cols);
-  parent_node->add_children(std::move(down_child),
-                            std::move(up_child));  // child pointers moved into the tree
-}
+  settings_.log.printf(
+    "Explored %d nodes in %.2fs.\n", stats_.nodes_explored, toc(stats_.start_time));
+  settings_.log.printf("Absolute Gap %e Objective %.16e Lower Bound %.16e\n", gap, obj, user_lower);
 
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::update_tree(mip_node_t<i_t, f_t>* node_ptr, node_status_t status)
-{
-  std::vector<mip_node_t<i_t, f_t>*> stack;
-  node_ptr->set_status(status, stack);
-  remove_fathomed_nodes(stack);
+  if (gap <= settings_.absolute_mip_gap_tol || gap_rel <= settings_.relative_mip_gap_tol) {
+    mip_status = mip_status_t::OPTIMAL;
+    if (gap > 0 && gap <= settings_.absolute_mip_gap_tol) {
+      settings_.log.printf("Optimal solution found within absolute MIP gap tolerance (%.1e)\n",
+                           settings_.absolute_mip_gap_tol);
+    } else if (gap > 0 && gap_rel <= settings_.relative_mip_gap_tol) {
+      settings_.log.printf("Optimal solution found within relative MIP gap tolerance (%.1e)\n",
+                           settings_.relative_mip_gap_tol);
+    } else {
+      settings_.log.printf("Optimal solution found.\n");
+    }
+    if (settings_.heuristic_preemption_callback != nullptr) {
+      settings_.heuristic_preemption_callback();
+    }
+  }
+
+  if (stats_.nodes_explored > 0 && stats_.nodes_unexplored == 0 && upper_bound == inf) {
+    settings_.log.printf("Integer infeasible.\n");
+    mip_status = mip_status_t::INFEASIBLE;
+    if (settings_.heuristic_preemption_callback != nullptr) {
+      settings_.heuristic_preemption_callback();
+    }
+  }
+
+  uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, solution.x);
+  solution.objective          = incumbent_.objective;
+  solution.lower_bound        = lower_bound;
+  solution.nodes_explored     = stats_.nodes_explored;
+  solution.simplex_iterations = stats_.total_lp_iters;
+
+  return mip_status;
 }
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
                                                          const std::vector<f_t>& leaf_solution,
                                                          i_t leaf_depth,
-                                                         char symbol)
+                                                         char thread_type)
 {
   bool send_solution   = false;
   i_t nodes_explored   = stats_.nodes_explored;
   i_t nodes_unexplored = stats_.nodes_unexplored;
-  f_t gap;
 
   mutex_upper_.lock();
   if (leaf_objective < upper_bound_) {
     incumbent_.set_incumbent_solution(leaf_objective, leaf_solution);
     upper_bound_    = leaf_objective;
     f_t lower_bound = get_lower_bound();
-    gap             = upper_bound_ - lower_bound;
     f_t obj         = compute_user_objective(original_lp_, upper_bound_);
     f_t lower       = compute_user_objective(original_lp_, lower_bound);
-    settings_.log.printf("%c%8d %8lu       %+13.6e  %+10.6e   %4d   %7.1e     %s %9.2f\n",
-                         symbol,
+    settings_.log.printf("%c%10d   %10lu    %+13.6e    %+10.6e   %6d   %7.1e     %s %9.2f\n",
+                         thread_type,
                          nodes_explored,
                          nodes_unexplored,
                          obj,
@@ -515,29 +528,57 @@ void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
     settings_.solution_callback(original_x, upper_bound_);
   }
   mutex_upper_.unlock();
+}
 
-  if (send_solution) {
-    mutex_gap_.lock();
-    gap_ = gap;
-    mutex_gap_.unlock();
+template <typename i_t, typename f_t>
+std::pair<mip_node_t<i_t, f_t>*, mip_node_t<i_t, f_t>*>
+branch_and_bound_t<i_t, f_t>::child_selection(mip_node_t<i_t, f_t>* node_ptr)
+{
+  const i_t branch_var     = node_ptr->get_down_child()->branch_var;
+  const f_t branch_var_val = node_ptr->get_down_child()->fractional_val;
+  const f_t down_val       = std::floor(root_relax_soln_.x[branch_var]);
+  const f_t up_val         = std::ceil(root_relax_soln_.x[branch_var]);
+  const f_t down_dist      = branch_var_val - down_val;
+  const f_t up_dist        = up_val - branch_var_val;
+  constexpr f_t eps        = 1e-6;
+
+  if (down_dist < up_dist + eps) {
+    return std::make_pair(node_ptr->get_down_child(), node_ptr->get_up_child());
+
+  } else {
+    return std::make_pair(node_ptr->get_up_child(), node_ptr->get_down_child());
   }
 }
 
 template <typename i_t, typename f_t>
-dual::status_t branch_and_bound_t<i_t, f_t>::node_dual_simplex(
-  i_t leaf_id,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  std::vector<variable_status_t>& leaf_vstatus,
-  lp_solution_t<i_t, f_t>& leaf_solution,
-  std::vector<bool>& bounds_changed,
-  csc_matrix_t<i_t, f_t>& Arow,
-  f_t upper_bound,
-  logger_t& log)
+node_status_t branch_and_bound_t<i_t, f_t>::solve_node(search_tree_t<i_t, f_t>& search_tree,
+                                                       mip_node_t<i_t, f_t>* node_ptr,
+                                                       lp_problem_t<i_t, f_t>& leaf_problem,
+                                                       const csc_matrix_t<i_t, f_t>& Arow,
+                                                       f_t upper_bound,
+                                                       logger_t& log,
+                                                       char thread_type)
 {
-  i_t node_iter = 0;
+  f_t abs_fathom_tol = settings_.absolute_mip_gap_tol / 10;
+
+  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
+  lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
   assert(leaf_vstatus.size() == leaf_problem.num_cols);
-  f_t lp_start_time                     = tic();
-  std::vector<f_t> leaf_edge_norms      = edge_norms_;  // = node.steepest_edge_norms;
+
+  // Set the correct bounds for the leaf problem
+  leaf_problem.lower = original_lp_.lower;
+  leaf_problem.upper = original_lp_.upper;
+
+  std::vector<bool> bounds_changed(leaf_problem.num_cols, false);
+  // Technically, we can get the already strengthened bounds from the node/parent instead of
+  // getting it from the original problem and re-strengthening. But this requires storing
+  // two vectors at each node and potentially cause memory issues
+  node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
+
+  i_t node_iter                    = 0;
+  f_t lp_start_time                = tic();
+  std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
+
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
   lp_settings.cut_off    = upper_bound + settings_.dual_tol;
@@ -562,307 +603,410 @@ dual::status_t branch_and_bound_t<i_t, f_t>::node_dual_simplex(
                             leaf_edge_norms);
 
     if (lp_status == dual::status_t::NUMERICAL) {
-      log.printf("Numerical issue node %d. Resolving from scratch.\n", leaf_id);
+      log.printf("Numerical issue node %d. Resolving from scratch.\n", node_ptr->node_id);
       lp_status_t second_status = solve_linear_program_advanced(
         leaf_problem, lp_start_time, lp_settings, leaf_solution, leaf_vstatus, leaf_edge_norms);
       lp_status = convert_lp_status_to_dual_status(second_status);
     }
   }
 
-  mutex_stats_.lock();
   stats_.total_lp_solve_time += toc(lp_start_time);
   stats_.total_lp_iters += node_iter;
-  mutex_stats_.unlock();
-
-  return lp_status;
-}
-
-template <typename i_t, typename f_t>
-mip_status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
-  mip_node_t<i_t, f_t>* node_ptr,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  csc_matrix_t<i_t, f_t>& Arow,
-  const std::vector<variable_type_t>& var_types,
-  f_t upper_bound)
-{
-  logger_t log;
-  log.log                                      = false;
-  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-  lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-
-  // Set the correct bounds for the leaf problem
-  leaf_problem.lower = original_lp_.lower;
-  leaf_problem.upper = original_lp_.upper;
-
-  std::vector<bool> bounds_changed(original_lp_.num_cols, false);
-  // Technically, we can get the already strengthened bounds from the node/parent instead of
-  // getting it from the original problem and re-strengthening. But this requires storing
-  // two vectors at each node and potentially cause memory issues
-  node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
-
-  dual::status_t lp_status = node_dual_simplex(node_ptr->node_id,
-                                               leaf_problem,
-                                               leaf_vstatus,
-                                               leaf_solution,
-                                               bounds_changed,
-                                               Arow,
-                                               upper_bound,
-                                               settings_.log);
 
   if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
-    node_ptr->lower_bound = inf;
-    graphviz_node(settings_, node_ptr, "infeasible", 0.0);
-    update_tree(node_ptr, node_status_t::INFEASIBLE);
     // Node was infeasible. Do not branch
+    node_ptr->lower_bound = inf;
+    search_tree.graphviz_node(log, node_ptr, "infeasible", 0.0);
+    search_tree.update_tree(node_ptr, node_status_t::INFEASIBLE);
+    return node_status_t::INFEASIBLE;
 
   } else if (lp_status == dual::status_t::CUTOFF) {
+    // Node was cut off. Do not branch
     node_ptr->lower_bound = upper_bound;
     f_t leaf_objective    = compute_objective(leaf_problem, leaf_solution.x);
-    graphviz_node(settings_, node_ptr, "cut off", leaf_objective);
-    update_tree(node_ptr, node_status_t::FATHOMED);
-    // Node was cut off. Do not branch
+    search_tree.graphviz_node(log, node_ptr, "cut off", leaf_objective);
+    search_tree.update_tree(node_ptr, node_status_t::FATHOMED);
+    return node_status_t::FATHOMED;
+
   } else if (lp_status == dual::status_t::OPTIMAL) {
     // LP was feasible
-    std::vector<i_t> fractional;
-    const i_t leaf_fractional =
-      fractional_variables(settings_, leaf_solution.x, var_types_, fractional);
-    f_t leaf_objective = compute_objective(leaf_problem, leaf_solution.x);
-    graphviz_node(settings_, node_ptr, "lower bound", leaf_objective);
-
-    mutex_pc_.lock();
-    pc_.update_pseudo_costs(node_ptr, leaf_objective);
-    mutex_pc_.unlock();
+    std::vector<i_t> leaf_fractional;
+    i_t leaf_num_fractional =
+      fractional_variables(settings_, leaf_solution.x, var_types_, leaf_fractional);
 
+    f_t leaf_objective    = compute_objective(leaf_problem, leaf_solution.x);
     node_ptr->lower_bound = leaf_objective;
+    search_tree.graphviz_node(log, node_ptr, "lower bound", leaf_objective);
+    pc_.update_pseudo_costs(node_ptr, leaf_objective);
 
-    constexpr f_t fathom_tol = 1e-5;
-    if (leaf_fractional == 0) {
-      add_feasible_solution(leaf_objective, leaf_solution.x, node_ptr->depth, 'B');
-      graphviz_node(settings_, node_ptr, "integer feasible", leaf_objective);
-      update_tree(node_ptr, node_status_t::INTEGER_FEASIBLE);
+    if (leaf_num_fractional == 0) {
+      // Found a integer feasible solution
+      add_feasible_solution(leaf_objective, leaf_solution.x, node_ptr->depth, thread_type);
+      search_tree.graphviz_node(log, node_ptr, "integer feasible", leaf_objective);
+      search_tree.update_tree(node_ptr, node_status_t::INTEGER_FEASIBLE);
+      return node_status_t::INTEGER_FEASIBLE;
 
-    } else if (leaf_objective <= upper_bound + fathom_tol) {
+    } else if (leaf_objective <= upper_bound + abs_fathom_tol) {
       // Choose fractional variable to branch on
-      mutex_pc_.lock();
-      const i_t branch_var = pc_.variable_selection(
-        fractional, leaf_solution.x, leaf_problem.lower, leaf_problem.upper, log);
-      mutex_pc_.unlock();
+      const i_t branch_var =
+        pc_.variable_selection(leaf_fractional, leaf_solution.x, lp_settings.log);
 
       assert(leaf_vstatus.size() == leaf_problem.num_cols);
-      branch(node_ptr, branch_var, leaf_solution.x[branch_var], leaf_vstatus);
+      search_tree.branch(
+        node_ptr, branch_var, leaf_solution.x[branch_var], leaf_vstatus, original_lp_, log);
       node_ptr->status = node_status_t::HAS_CHILDREN;
+      return node_status_t::HAS_CHILDREN;
 
     } else {
-      graphviz_node(settings_, node_ptr, "fathomed", leaf_objective);
-      update_tree(node_ptr, node_status_t::FATHOMED);
+      search_tree.graphviz_node(log, node_ptr, "fathomed", leaf_objective);
+      search_tree.update_tree(node_ptr, node_status_t::FATHOMED);
+      return node_status_t::FATHOMED;
     }
+  } else if (lp_status == dual::status_t::TIME_LIMIT) {
+    search_tree.graphviz_node(log, node_ptr, "timeout", 0.0);
+    search_tree.update_tree(node_ptr, node_status_t::TIME_LIMIT);
+    return node_status_t::TIME_LIMIT;
+
   } else {
-    graphviz_node(settings_, node_ptr, "numerical", 0.0);
-    settings_.log.printf("Encountered LP status %d. This indicates a numerical issue.\n",
-                         lp_status);
-    return mip_status_t::NUMERICAL;
-  }
+    if (thread_type == 'B') {
+      lower_bound_ceiling_.fetch_min(node_ptr->lower_bound);
+      log.printf(
+        "LP returned status %d on node %d. This indicates a numerical issue. The best bound is set "
+        "to "
+        "%+10.6e.\n",
+        lp_status,
+        node_ptr->node_id,
+        compute_user_objective(original_lp_, lower_bound_ceiling_.load()));
+    }
 
-  return mip_status_t::UNSET;
+    search_tree.graphviz_node(log, node_ptr, "numerical", 0.0);
+    search_tree.update_tree(node_ptr, node_status_t::NUMERICAL);
+    return node_status_t::NUMERICAL;
+  }
 }
 
 template <typename i_t, typename f_t>
-mip_status_t branch_and_bound_t<i_t, f_t>::explore_tree(i_t branch_var,
-                                                        mip_solution_t<i_t, f_t>& solution)
+void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(search_tree_t<i_t, f_t>* search_tree,
+                                                       mip_node_t<i_t, f_t>* node,
+                                                       lp_problem_t<i_t, f_t>& leaf_problem,
+                                                       const csc_matrix_t<i_t, f_t>& Arow,
+                                                       i_t initial_heap_size)
 {
-  mip_status_t status = mip_status_t::UNSET;
-  logger_t log;
-  log.log      = false;
-  auto compare = [](mip_node_t<i_t, f_t>* a, mip_node_t<i_t, f_t>* b) {
-    return a->lower_bound >
-           b->lower_bound;  // True if a comes before b, elements that come before are output last
-  };
+  if (status_ != mip_exploration_status_t::RUNNING) { return; }
+  if (omp_get_thread_num() == 0) { repair_heuristic_solutions(); }
+
+  f_t lower_bound      = node->lower_bound;
+  f_t upper_bound      = get_upper_bound();
+  f_t rel_gap          = user_relative_gap(original_lp_, upper_bound, lower_bound);
+  f_t abs_gap          = upper_bound - lower_bound;
+  i_t nodes_explored   = 0;
+  i_t nodes_unexplored = 0;
+
+  nodes_explored   = (stats_.nodes_explored++);
+  nodes_unexplored = (stats_.nodes_unexplored--);
+  stats_.nodes_since_last_log++;
+
+  if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
+    search_tree->graphviz_node(settings_.log, node, "cutoff", node->lower_bound);
+    search_tree->update_tree(node, node_status_t::FATHOMED);
+    return;
+  }
 
-  std::priority_queue<mip_node_t<i_t, f_t>*, std::vector<mip_node_t<i_t, f_t>*>, decltype(compare)>
-    heap(compare);
+  f_t now = toc(stats_.start_time);
 
-  mip_node_t<i_t, f_t> root_node(root_objective_, root_vstatus_);
-  graphviz_node(settings_, &root_node, "lower bound", root_objective_);
+  if (omp_get_thread_num() == 0) {
+    f_t time_since_last_log = stats_.last_log == 0 ? 1.0 : toc(stats_.last_log);
 
-  branch(&root_node, branch_var, root_relax_soln_.x[branch_var], root_vstatus_);
+    if (((stats_.nodes_since_last_log >= 10 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+         (time_since_last_log >= 1)) ||
+        (time_since_last_log > 30) || now > settings_.time_limit) {
+      f_t obj              = compute_user_objective(original_lp_, upper_bound);
+      f_t user_lower       = compute_user_objective(original_lp_, root_objective_);
+      std::string gap_user = user_mip_gap<f_t>(obj, user_lower);
 
-  // the stack does not own the unique_ptr the tree does
-  heap.push(root_node.get_down_child());
-  heap.push(root_node.get_up_child());
+      settings_.log.printf(" %10d   %10lu    %+13.6e    %+10.6e   %6d   %7.1e     %s %9.2f\n",
+                           nodes_explored,
+                           nodes_unexplored,
+                           obj,
+                           user_lower,
+                           node->depth,
+                           nodes_explored > 0 ? stats_.total_lp_iters / nodes_explored : 0,
+                           gap_user.c_str(),
+                           now);
 
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t leaf_problem = original_lp_;
-  csc_matrix_t<i_t, f_t> Arow(1, 1, 1);
-  original_lp_.A.transpose(Arow);
+      stats_.nodes_since_last_log = 0;
+    }
+  }
+
+  if (toc(stats_.start_time) > settings_.time_limit) {
+    status_ = mip_exploration_status_t::TIME_LIMIT;
+    return;
+  }
+  node_status_t node_status =
+    solve_node(*search_tree, node, leaf_problem, Arow, upper_bound, settings_.log, 'B');
+
+  if (node_status == node_status_t::TIME_LIMIT) {
+    status_ = mip_exploration_status_t::TIME_LIMIT;
+    return;
+
+  } else if (node_status == node_status_t::HAS_CHILDREN) {
+    stats_.nodes_unexplored += 2;
+
+    // If we haven't generated enough nodes to keep the threads busy, continue the ramp up phase
+    if (stats_.nodes_unexplored < initial_heap_size) {
+#pragma omp task
+      exploration_ramp_up(
+        search_tree, node->get_down_child(), leaf_problem, Arow, initial_heap_size);
+
+#pragma omp task
+      exploration_ramp_up(search_tree, node->get_up_child(), leaf_problem, Arow, initial_heap_size);
+
+    } else {
+      // We've generated enough nodes, push further nodes onto the heap
+      mutex_heap_.lock();
+      heap_.push(node->get_down_child());
+      heap_.push(node->get_up_child());
+      mutex_heap_.unlock();
+    }
+  }
+}
 
-  f_t lower_bound = get_lower_bound();
-  f_t gap         = get_upper_bound() - lower_bound;
-  f_t last_log    = 0;
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
+                                                   search_tree_t<i_t, f_t>& search_tree,
+                                                   mip_node_t<i_t, f_t>* start_node,
+                                                   lp_problem_t<i_t, f_t>& leaf_problem,
+                                                   const csc_matrix_t<i_t, f_t>& Arow)
+{
+  std::deque<mip_node_t<i_t, f_t>*> stack;
+  stack.push_front(start_node);
 
-  f_t user_upper_bound = compute_user_objective(original_lp_, get_upper_bound());
-  f_t user_lower_bound = compute_user_objective(original_lp_, lower_bound);
-  f_t rel_gap          = relative_gap(user_upper_bound, user_lower_bound);
-  while (gap > settings_.absolute_mip_gap_tol && rel_gap > settings_.relative_mip_gap_tol &&
-         heap.size() > 0) {
-    repair_heuristic_solutions();
+  while (stack.size() > 0 && status_ == mip_exploration_status_t::RUNNING) {
+    if (omp_get_thread_num() == 0) { repair_heuristic_solutions(); }
 
-    // Get a node off the heap
-    mip_node_t<i_t, f_t>* node_ptr = heap.top();
-    heap.pop();
-    stats_.nodes_explored++;
+    mip_node_t<i_t, f_t>* node_ptr = stack.front();
+    stack.pop_front();
 
+    f_t lower_bound = node_ptr->lower_bound;
     f_t upper_bound = get_upper_bound();
-    if (upper_bound < node_ptr->lower_bound) {
-      // This node was put on the heap earlier but its lower bound is now greater than the current
-      // upper bound
-      update_tree(node_ptr, node_status_t::FATHOMED);
-      graphviz_node(settings_, node_ptr, "cutoff", node_ptr->lower_bound);
+    f_t abs_gap     = upper_bound - lower_bound;
+    f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
+
+    // This is based on three assumptions:
+    // - The stack only contains sibling nodes, i.e., the current node and it sibling (if
+    // applicable)
+    // - The current node and its siblings uses the lower bound of the parent before solving the LP
+    // relaxation
+    // - The lower bound of the parent is lower or equal to its children
+    assert(id < local_lower_bounds_.size());
+    local_lower_bounds_[id] = lower_bound;
+    i_t nodes_explored      = stats_.nodes_explored++;
+    i_t nodes_unexplored    = stats_.nodes_unexplored--;
+    stats_.nodes_since_last_log++;
+
+    if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
+      search_tree.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
+      search_tree.update_tree(node_ptr, node_status_t::FATHOMED);
       continue;
     }
-    mutex_lower_.lock();
-    lower_bound = lower_bound_ = node_ptr->lower_bound;
-    mutex_lower_.unlock();
-
-    mutex_gap_.lock();
-    gap_ = gap = upper_bound - lower_bound;
-    mutex_gap_.unlock();
-
-    i_t nodes_explored = stats_.nodes_explored;
-    f_t now            = toc(stats_.start_time);
-    f_t time_since_log = last_log == 0 ? 1.0 : toc(last_log);
-    if ((nodes_explored % 1000 == 0 || gap < 10 * settings_.absolute_mip_gap_tol ||
-         nodes_explored < 1000) &&
-          (time_since_log >= 1) ||
-        (time_since_log > 60) || now > settings_.time_limit) {
-      f_t user_obj         = compute_user_objective(original_lp_, upper_bound);
-      f_t user_lower       = compute_user_objective(original_lp_, lower_bound);
-      std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
-
-      settings_.log.printf(" %8d %8lu       %+13.6e  %+10.6e   %4d   %7.1e     %s %9.2f\n",
-                           nodes_explored,
-                           heap.size(),
-                           user_obj,
-                           user_lower,
-                           node_ptr->depth,
-                           nodes_explored > 0 ? stats_.total_lp_iters / nodes_explored : 0,
-                           user_gap.c_str(),
-                           now);
-      last_log = tic();
+
+    f_t now = toc(stats_.start_time);
+
+    if (id == 0) {
+      f_t time_since_last_log = stats_.last_log == 0 ? 1.0 : toc(stats_.last_log);
+
+      if (((stats_.nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+           time_since_last_log >= 1) ||
+          (time_since_last_log > 30) || now > settings_.time_limit) {
+        f_t obj              = compute_user_objective(original_lp_, upper_bound);
+        f_t user_lower       = compute_user_objective(original_lp_, get_lower_bound());
+        std::string gap_user = user_mip_gap<f_t>(obj, user_lower);
+        settings_.log.printf(" %10d   %10lu    %+13.6e    %+10.6e   %6d   %7.1e     %s %9.2f\n",
+                             nodes_explored,
+                             nodes_unexplored,
+                             obj,
+                             user_lower,
+                             node_ptr->depth,
+                             nodes_explored > 0 ? stats_.total_lp_iters / nodes_explored : 0,
+                             gap_user.c_str(),
+                             now);
+        stats_.last_log             = tic();
+        stats_.nodes_since_last_log = 0;
+      }
     }
 
     if (toc(stats_.start_time) > settings_.time_limit) {
-      settings_.log.printf("Hit time limit. Stopping\n");
-      status = mip_status_t::TIME_LIMIT;
+      status_ = mip_exploration_status_t::TIME_LIMIT;
       break;
     }
 
-    status = solve_node_lp(node_ptr, leaf_problem, Arow, var_types_, upper_bound);
+    node_status_t node_status =
+      solve_node(search_tree, node_ptr, leaf_problem, Arow, upper_bound, settings_.log, 'B');
+
+    if (node_status == node_status_t::TIME_LIMIT) {
+      status_ = mip_exploration_status_t::TIME_LIMIT;
+      break;
+
+    } else if (node_status == node_status_t::HAS_CHILDREN) {
+      // The stack should only contain the children of the current parent.
+      // If the stack size is greater than 0,
+      // we pop the current node from the stack and place it in the global heap,
+      // since we are about to add the two children to the stack
+      if (stack.size() > 0) {
+        mip_node_t<i_t, f_t>* node = stack.back();
+        stack.pop_back();
+
+        // The order here matters. We want to create a copy of the node
+        // before adding to the global heap. Otherwise,
+        // some thread may consume the node (possibly fathoming it)
+        // before we had the chance to add to the diving queue.
+        // This lead to a SIGSEGV. Although, in this case, it
+        // would be better if we discard the node instead.
+        if (get_heap_size() > settings_.num_bfs_threads) {
+          mutex_dive_queue_.lock();
+          dive_queue_.push(node->detach_copy());
+          mutex_dive_queue_.unlock();
+        }
+
+        mutex_heap_.lock();
+        heap_.push(node);
+        mutex_heap_.unlock();
+      }
 
-    if (status == mip_status_t::NUMERICAL) { break; }
+      stats_.nodes_unexplored += 2;
 
-    if (node_ptr->status == node_status_t::HAS_CHILDREN) {
-      // the heap does not own the unique_ptr the tree does
-      heap.push(node_ptr->get_down_child());
-      heap.push(node_ptr->get_up_child());
+      auto [first, second] = child_selection(node_ptr);
+      stack.push_front(second);
+      stack.push_front(first);
     }
   }
+}
 
-  stats_.nodes_unexplored = heap.size();
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t id,
+                                                     search_tree_t<i_t, f_t>& search_tree,
+                                                     lp_problem_t<i_t, f_t>& leaf_problem,
+                                                     const csc_matrix_t<i_t, f_t>& Arow)
+{
+  f_t lower_bound = -inf;
+  f_t upper_bound = inf;
+  f_t abs_gap     = inf;
+  f_t rel_gap     = inf;
+
+  while (status_ == mip_exploration_status_t::RUNNING && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol &&
+         (active_subtrees_ > 0 || get_heap_size() > 0)) {
+    mip_node_t<i_t, f_t>* node_ptr = nullptr;
+
+    // If there any node left in the heap, we pop the top node and explore it.
+    mutex_heap_.lock();
+    if (heap_.size() > 0) {
+      node_ptr = heap_.top();
+      heap_.pop();
+      active_subtrees_++;
+    }
+    mutex_heap_.unlock();
+
+    if (node_ptr != nullptr) {
+      if (get_upper_bound() < node_ptr->lower_bound) {
+        // This node was put on the heap earlier but its lower bound is now greater than the
+        // current upper bound
+        search_tree.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
+        search_tree.update_tree(node_ptr, node_status_t::FATHOMED);
+        active_subtrees_--;
+        continue;
+      }
 
-  if (stats_.nodes_unexplored == 0) {
-    mutex_lower_.lock();
-    lower_bound = lower_bound_ = root_node.lower_bound;
-    mutex_lower_.unlock();
+      // Best-first search with plunging
+      explore_subtree(id, search_tree, node_ptr, leaf_problem, Arow);
+      active_subtrees_--;
+    }
 
-    mutex_gap_.lock();
-    gap_ = gap = get_upper_bound() - lower_bound;
-    mutex_gap_.unlock();
+    lower_bound = get_lower_bound();
+    upper_bound = get_upper_bound();
+    abs_gap     = upper_bound - lower_bound;
+    rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
   }
 
-  return status;
+  // Check if it is the last thread that exited the loop and no
+  // timeout or numerical error has happen.
+  if (status_ == mip_exploration_status_t::RUNNING) {
+    if (active_subtrees_ == 0) {
+      status_ = mip_exploration_status_t::COMPLETED;
+    } else {
+      local_lower_bounds_[id] = inf;
+    }
+  }
 }
 
 template <typename i_t, typename f_t>
-mip_status_t branch_and_bound_t<i_t, f_t>::dive(i_t branch_var, mip_solution_t<i_t, f_t>& solution)
+void branch_and_bound_t<i_t, f_t>::diving_thread(lp_problem_t<i_t, f_t>& leaf_problem,
+                                                 const csc_matrix_t<i_t, f_t>& Arow)
 {
-  mip_status_t status = mip_status_t::UNSET;
-
   logger_t log;
   log.log = false;
 
-  std::vector<mip_node_t<i_t, f_t>*> node_stack;
-
-  mip_node_t<i_t, f_t> root_node(root_objective_, root_vstatus_);
-  graphviz_node(settings_, &root_node, "lower bound", root_objective_);
-
-  branch(&root_node, branch_var, root_relax_soln_.x[branch_var], root_vstatus_);
+  while (status_ == mip_exploration_status_t::RUNNING &&
+         (active_subtrees_ > 0 || get_heap_size() > 0)) {
+    std::optional<mip_node_t<i_t, f_t>> start_node;
 
-  // the stack does not own the unique_ptr the tree does
-  node_stack.push_back(root_node.get_down_child());
-  node_stack.push_back(root_node.get_up_child());
+    mutex_dive_queue_.lock();
+    if (dive_queue_.size() > 0) { start_node = dive_queue_.pop(); }
+    mutex_dive_queue_.unlock();
 
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t leaf_problem = original_lp_;
+    if (start_node.has_value()) {
+      if (get_upper_bound() < start_node->lower_bound) { continue; }
 
-  csc_matrix_t<i_t, f_t> Arow(1, 1, 1);
-  original_lp_.A.transpose(Arow);
+      search_tree_t<i_t, f_t> subtree(std::move(start_node.value()));
+      std::deque<mip_node_t<i_t, f_t>*> stack;
+      stack.push_front(&subtree.root);
 
-  f_t lower_bound    = get_lower_bound();
-  f_t gap            = get_upper_bound() - lower_bound;
-  i_t nodes_explored = 0;
-
-  while (node_stack.size() > 0) {
-    // Get a node off the stack
-    mip_node_t<i_t, f_t>* node_ptr = node_stack.back();
-    node_stack.pop_back();
-    nodes_explored++;
-
-    f_t upper_bound      = get_upper_bound();
-    lower_bound          = get_lower_bound();
-    gap                  = upper_bound - lower_bound;
-    f_t user_upper_bound = compute_user_objective(original_lp_, get_upper_bound());
-    f_t user_lower_bound = compute_user_objective(original_lp_, lower_bound);
-    f_t rel_gap          = relative_gap(user_upper_bound, user_lower_bound);
-    if (gap < settings_.absolute_mip_gap_tol && rel_gap < settings_.relative_mip_gap_tol) {
-      update_tree(node_ptr, node_status_t::FATHOMED);
-      continue;
-    }
+      while (stack.size() > 0 && status_ == mip_exploration_status_t::RUNNING) {
+        mip_node_t<i_t, f_t>* node_ptr = stack.front();
+        stack.pop_front();
+        f_t upper_bound = get_upper_bound();
+        f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, node_ptr->lower_bound);
 
-    if (toc(stats_.start_time) > settings_.time_limit) {
-      status = mip_status_t::TIME_LIMIT;
-      break;
-    }
+        if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
+          continue;
+        }
 
-    status = solve_node_lp(node_ptr, leaf_problem, Arow, var_types_, upper_bound);
-
-    if (status == mip_status_t::NUMERICAL) { continue; }
-
-    if (node_ptr->status == node_status_t::HAS_CHILDREN) {
-      // Martin's child selection
-      const i_t branch_var     = node_ptr->get_down_child()->branch_var;
-      const f_t branch_var_val = node_ptr->get_down_child()->fractional_val;
-      const f_t down_val       = std::floor(root_relax_soln_.x[branch_var]);
-      const f_t up_val         = std::ceil(root_relax_soln_.x[branch_var]);
-      const f_t down_dist      = branch_var_val - down_val;
-      const f_t up_dist        = up_val - branch_var_val;
-
-      if (down_dist < up_dist) {
-        node_stack.push_back(node_ptr->get_up_child());
-        node_stack.push_back(node_ptr->get_down_child());
-      } else {
-        node_stack.push_back(node_ptr->get_down_child());
-        node_stack.push_back(node_ptr->get_up_child());
+        node_status_t node_status =
+          solve_node(subtree, node_ptr, leaf_problem, Arow, upper_bound, log, 'D');
+
+        if (node_status == node_status_t::TIME_LIMIT) {
+          break;
+
+        } else if (node_status == node_status_t::HAS_CHILDREN) {
+          auto [first, second] = child_selection(node_ptr);
+          stack.push_front(second);
+          stack.push_front(first);
+
+          // If the diving thread is consuming the nodes faster than the
+          // best first search, then we split the current subtree at the
+          // lowest possible point and move to the queue, so it can
+          // be picked by another thread.
+          if (dive_queue_.size() < min_diving_queue_size_) {
+            mutex_dive_queue_.lock();
+            mip_node_t<i_t, f_t>* new_node = stack.back();
+            stack.pop_back();
+            dive_queue_.push(new_node->detach_copy());
+            mutex_dive_queue_.unlock();
+          }
+        }
       }
     }
   }
-
-  return status;
 }
 
 template <typename i_t, typename f_t>
 mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solution)
 {
-  mip_status_t status = mip_status_t::UNSET;
+  logger_t log;
+  log.log                 = false;
+  status_                 = mip_exploration_status_t::UNSET;
+  stats_.nodes_unexplored = 0;
+  stats_.nodes_explored   = 0;
 
   if (guess_.size() != 0) {
     std::vector<f_t> crushed_guess;
@@ -888,6 +1032,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   lp_settings.inside_mip                = 1;
   lp_status_t root_status               = solve_linear_program_advanced(
     original_lp_, stats_.start_time, lp_settings, root_relax_soln_, root_vstatus_, edge_norms_);
+  stats_.total_lp_iters      = root_relax_soln_.iterations;
   stats_.total_lp_solve_time = toc(stats_.start_time);
   assert(root_vstatus_.size() == original_lp_.num_cols);
   if (root_status == lp_status_t::INFEASIBLE) {
@@ -907,13 +1052,17 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     }
     return mip_status_t::UNBOUNDED;
   }
+
   if (root_status == lp_status_t::TIME_LIMIT) {
-    settings_.log.printf("Hit time limit\n");
-    return mip_status_t::TIME_LIMIT;
+    status_ = mip_exploration_status_t::TIME_LIMIT;
+    return set_final_solution(solution, -inf);
   }
+
   set_uninitialized_steepest_edge_norms<i_t, f_t>(edge_norms_);
 
   root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
+  local_lower_bounds_.assign(settings_.num_bfs_threads, root_objective_);
+
   if (settings_.set_simplex_solution_callback != nullptr) {
     std::vector<f_t> original_x;
     uncrush_primal_solution(original_problem_, original_lp_, root_relax_soln_.x, original_x);
@@ -928,9 +1077,6 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     settings_.set_simplex_solution_callback(
       original_x, original_dual, compute_user_objective(original_lp_, root_objective_));
   }
-  mutex_lower_.lock();
-  lower_bound_ = root_objective_;
-  mutex_lower_.unlock();
 
   std::vector<i_t> fractional;
   const i_t num_fractional =
@@ -944,12 +1090,13 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     // We should be done here
     uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, solution.x);
     solution.objective          = incumbent_.objective;
-    solution.lower_bound        = lower_bound_;
+    solution.lower_bound        = root_objective_;
     solution.nodes_explored     = 0;
     solution.simplex_iterations = root_relax_soln_.iterations;
     settings_.log.printf("Optimal solution found at root node. Objective %.16e. Time %.2f.\n",
                          compute_user_objective(original_lp_, root_objective_),
                          toc(stats_.start_time));
+
     if (settings_.solution_callback != nullptr) {
       settings_.solution_callback(solution.x, solution.objective);
     }
@@ -971,81 +1118,84 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                              edge_norms_,
                              pc_);
 
+  if (toc(stats_.start_time) > settings_.time_limit) {
+    status_ = mip_exploration_status_t::TIME_LIMIT;
+    return set_final_solution(solution, root_objective_);
+  }
+
   // Choose variable to branch on
-  logger_t log;
-  log.log        = false;
-  i_t branch_var = pc_.variable_selection(
-    fractional, root_relax_soln_.x, original_lp_.lower, original_lp_.upper, log);
+  i_t branch_var = pc_.variable_selection(fractional, root_relax_soln_.x, log);
 
-  stats_.total_lp_iters   = 0;
-  stats_.nodes_explored   = 0;
-  stats_.nodes_unexplored = 0;
-  stats_.num_nodes        = 1;
+  search_tree_t<i_t, f_t> search_tree(root_objective_, root_vstatus_);
+  search_tree.graphviz_node(settings_.log, &search_tree.root, "lower bound", root_objective_);
+  search_tree.branch(&search_tree.root,
+                     branch_var,
+                     root_relax_soln_.x[branch_var],
+                     root_vstatus_,
+                     original_lp_,
+                     log);
 
   settings_.log.printf(
-    "| Explored | Unexplored | Objective   |    Bound    | Depth | Iter/Node |  Gap   | "
-    "   Time \n");
+    "Exploring the B&B tree using %d best-first threads and %d diving threads (%d threads)\n",
+    settings_.num_bfs_threads,
+    settings_.num_diving_threads,
+    settings_.num_threads);
 
-  mutex_branching_.lock();
-  currently_branching_ = true;
-  mutex_branching_.unlock();
-
-  std::future<mip_status_t> diving_thread;
-
-  if (settings_.num_threads > 0) {
-    diving_thread = std::async(std::launch::async, [&]() { return dive(branch_var, solution); });
-  }
+  settings_.log.printf(
+    " | Explored | Unexplored |    Objective    |     Bound     | Depth | Iter/Node |   Gap    "
+    "|  Time  |\n");
 
-  status = explore_tree(branch_var, solution);
+  csc_matrix_t<i_t, f_t> Arow(1, 1, 1);
+  original_lp_.A.transpose(Arow);
 
-  if (settings_.num_threads > 0) { mip_status_t diving_status = diving_thread.get(); }
+  stats_.nodes_explored       = 0;
+  stats_.nodes_unexplored     = 2;
+  stats_.nodes_since_last_log = 0;
+  stats_.last_log             = tic();
+  active_subtrees_            = 0;
+  min_diving_queue_size_      = 4 * settings_.num_diving_threads;
+  status_                     = mip_exploration_status_t::RUNNING;
+  lower_bound_ceiling_        = inf;
+
+#pragma omp parallel num_threads(settings_.num_threads)
+  {
+    // Make a copy of the original LP. We will modify its bounds at each leaf
+    lp_problem_t leaf_problem = original_lp_;
+
+#pragma omp master
+    {
+      auto down_child  = search_tree.root.get_down_child();
+      auto up_child    = search_tree.root.get_up_child();
+      i_t initial_size = 2 * settings_.num_threads;
+
+#pragma omp task
+      exploration_ramp_up(&search_tree, down_child, leaf_problem, Arow, initial_size);
+
+#pragma omp task
+      exploration_ramp_up(&search_tree, up_child, leaf_problem, Arow, initial_size);
+    }
 
-  mutex_branching_.lock();
-  currently_branching_ = false;
-  mutex_branching_.unlock();
+#pragma omp barrier
 
-  f_t user_upper_bound = compute_user_objective(original_lp_, get_upper_bound());
-  f_t user_lower_bound = compute_user_objective(original_lp_, lower_bound_);
-  settings_.log.printf(
-    "Explored %d nodes in %.2fs.\nAbsolute Gap %e Objective %.16e Lower Bound %.16e\n",
-    stats_.nodes_explored.load(),
-    toc(stats_.start_time),
-    gap_,
-    user_upper_bound,
-    user_lower_bound);
-
-  if (gap_ <= settings_.absolute_mip_gap_tol ||
-      relative_gap(user_upper_bound, user_lower_bound) <= settings_.relative_mip_gap_tol) {
-    status = mip_status_t::OPTIMAL;
-    if (gap_ > 0 && gap_ <= settings_.absolute_mip_gap_tol) {
-      settings_.log.printf("Optimal solution found within absolute MIP gap tolerance (%.1e)\n",
-                           settings_.absolute_mip_gap_tol);
-    } else if (gap_ > 0 &&
-               relative_gap(user_upper_bound, user_lower_bound) <= settings_.relative_mip_gap_tol) {
-      settings_.log.printf("Optimal solution found within relative MIP gap tolerance (%.1e)\n",
-                           settings_.relative_mip_gap_tol);
-    } else {
-      settings_.log.printf("Optimal solution found.\n");
-    }
-    if (settings_.heuristic_preemption_callback != nullptr) {
-      settings_.heuristic_preemption_callback();
-    }
-  }
+#pragma omp master
+    {
+      if (status_ == mip_exploration_status_t::RUNNING &&
+          (active_subtrees_ > 0 || get_heap_size() > 0)) {
+        for (i_t i = 0; i < settings_.num_bfs_threads; i++) {
+#pragma omp task
+          best_first_thread(i, search_tree, leaf_problem, Arow);
+        }
 
-  if (stats_.nodes_unexplored == 0 && get_upper_bound() == inf) {
-    settings_.log.printf("Integer infeasible.\n");
-    status = mip_status_t::INFEASIBLE;
-    if (settings_.heuristic_preemption_callback != nullptr) {
-      settings_.heuristic_preemption_callback();
+        for (i_t i = 0; i < settings_.num_diving_threads; i++) {
+#pragma omp task
+          diving_thread(leaf_problem, Arow);
+        }
+      }
     }
   }
 
-  uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, solution.x);
-  solution.objective          = incumbent_.objective;
-  solution.lower_bound        = get_lower_bound();
-  solution.nodes_explored     = stats_.nodes_explored;
-  solution.simplex_iterations = stats_.total_lp_iters;
-  return status;
+  f_t lower_bound = heap_.size() > 0 ? heap_.top()->lower_bound : search_tree.root.lower_bound;
+  return set_final_solution(solution, lower_bound);
 }
 
 #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index efe28000bd..7b80f88fa9 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -18,40 +18,82 @@
 #pragma once
 
 #include <dual_simplex/initial_basis.hpp>
+#include <dual_simplex/mip_node.hpp>
 #include <dual_simplex/phase2.hpp>
 #include <dual_simplex/presolve.hpp>
 #include <dual_simplex/pseudo_costs.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
 #include <dual_simplex/solution.hpp>
 #include <dual_simplex/types.hpp>
+#include <utilities/omp_helpers.hpp>
 
-#include <atomic>
-#include <memory>
-#include <mutex>
+#include <omp.h>
 #include <queue>
-#include <string>
 #include <vector>
-#include "cuopt/linear_programming/mip/solver_settings.hpp"
-#include "dual_simplex/mip_node.hpp"
 
 namespace cuopt::linear_programming::dual_simplex {
 
 enum class mip_status_t {
-  OPTIMAL    = 0,
-  UNBOUNDED  = 1,
-  INFEASIBLE = 2,
-  TIME_LIMIT = 3,
-  NODE_LIMIT = 4,
-  NUMERICAL  = 5,
-  UNSET      = 6
+  OPTIMAL    = 0,  // The optimal integer solution was found
+  UNBOUNDED  = 1,  // The problem is unbounded
+  INFEASIBLE = 2,  // The problem is infeasible
+  TIME_LIMIT = 3,  // The solver reached a time limit
+  NODE_LIMIT = 4,  // The maximum number of nodes was reached (not implemented)
+  NUMERICAL  = 5,  // The solver encountered a numerical error
+  UNSET      = 6,  // The status is not set
+};
+
+enum class mip_exploration_status_t {
+  UNSET      = 0,  // The status is not set
+  TIME_LIMIT = 1,  // The solver reached a time limit
+  NODE_LIMIT = 2,  // The maximum number of nodes was reached (not implemented)
+  NUMERICAL  = 3,  // The solver encountered a numerical error
+  RUNNING    = 4,  // The solver is currently exploring the tree
+  COMPLETED  = 5,  // The solver finished exploring the tree
 };
 
 template <typename i_t, typename f_t>
 void upper_bound_callback(f_t upper_bound);
 
+// A min-heap for storing the starting nodes for the dives.
+// This has a maximum size of 8192, such that the container
+// will discard the least promising node if the queue is full.
+template <typename i_t, typename f_t>
+class dive_queue_t {
+ private:
+  std::vector<mip_node_t<i_t, f_t>> buffer;
+  static constexpr i_t max_size_ = 2048;
+
+ public:
+  dive_queue_t() { buffer.reserve(max_size_); }
+
+  void push(mip_node_t<i_t, f_t>&& node)
+  {
+    buffer.push_back(std::move(node));
+    std::push_heap(buffer.begin(), buffer.end(), node_compare_t<i_t, f_t>());
+    if (buffer.size() > max_size()) { buffer.pop_back(); }
+  }
+
+  mip_node_t<i_t, f_t> pop()
+  {
+    std::pop_heap(buffer.begin(), buffer.end(), node_compare_t<i_t, f_t>());
+    mip_node_t<i_t, f_t> node = std::move(buffer.back());
+    buffer.pop_back();
+    return node;
+  }
+
+  i_t size() const { return buffer.size(); }
+  constexpr i_t max_size() const { return max_size_; }
+  const mip_node_t<i_t, f_t>& top() const { return buffer.front(); }
+  void clear() { buffer.clear(); }
+};
+
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
+  template <typename T>
+  using mip_node_heap_t = std::priority_queue<T, std::vector<T>, node_compare_t<i_t, f_t>>;
+
   branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
                      const simplex_solver_settings_t<i_t, f_t>& solver_settings);
 
@@ -68,8 +110,8 @@ class branch_and_bound_t {
                        std::vector<f_t>& repaired_solution) const;
 
   f_t get_upper_bound();
-
   f_t get_lower_bound();
+  i_t get_heap_size();
 
   // The main entry routine. Returns the solver status and populates solution with the incumbent.
   mip_status_t solve(mip_solution_t<i_t, f_t>& solution);
@@ -81,18 +123,16 @@ class branch_and_bound_t {
   // Initial guess.
   std::vector<f_t> guess_;
 
+  // LP relaxation
   lp_problem_t<i_t, f_t> original_lp_;
   std::vector<i_t> new_slacks_;
   std::vector<variable_type_t> var_types_;
 
-  // Mutex for lower bound
-  std::mutex mutex_lower_;
-
-  // Global variable for lower bound
-  f_t lower_bound_;
+  // Local lower bounds for each thread
+  std::vector<omp_atomic_t<f_t>> local_lower_bounds_;
 
   // Mutex for upper bound
-  std::mutex mutex_upper_;
+  omp_mutex_t mutex_upper_;
 
   // Global variable for upper bound
   f_t upper_bound_;
@@ -100,31 +140,21 @@ class branch_and_bound_t {
   // Global variable for incumbent. The incumbent should be updated with the upper bound
   mip_solution_t<i_t, f_t> incumbent_;
 
-  // Mutex for gap
-  std::mutex mutex_gap_;
-
-  // Global variable for gap
-  f_t gap_;
-
-  // Mutex for branching
-  std::mutex mutex_branching_;
-  bool currently_branching_;
-
-  // Global variable for stats
-  std::mutex mutex_stats_;
-
-  // Note that floating point atomics are only supported in C++20.
+  // Structure with the general info of the solver.
   struct stats_t {
-    f_t start_time                    = 0.0;
-    f_t total_lp_solve_time           = 0.0;
-    std::atomic<i_t> nodes_explored   = 0;
-    std::atomic<i_t> nodes_unexplored = 0;
-    f_t total_lp_iters                = 0;
-    std::atomic<i_t> num_nodes        = 0;
+    f_t start_time                        = 0.0;
+    omp_atomic_t<f_t> total_lp_solve_time = 0.0;
+    omp_atomic_t<i_t> nodes_explored      = 0;
+    omp_atomic_t<i_t> nodes_unexplored    = 0;
+    omp_atomic_t<f_t> total_lp_iters      = 0;
+
+    // This should only be used by the main thread
+    f_t last_log                           = 0.0;
+    omp_atomic_t<i_t> nodes_since_last_log = 0;
   } stats_;
 
   // Mutex for repair
-  std::mutex mutex_repair_;
+  omp_mutex_t mutex_repair_;
   std::vector<std::vector<f_t>> repair_queue_;
 
   // Variables for the root node in the search tree.
@@ -135,49 +165,77 @@ class branch_and_bound_t {
 
   // Pseudocosts
   pseudo_costs_t<i_t, f_t> pc_;
-  std::mutex mutex_pc_;
 
-  // Update the status of the nodes in the search tree.
-  void update_tree(mip_node_t<i_t, f_t>* node_ptr, node_status_t status);
+  // Heap storing the nodes to be explored.
+  omp_mutex_t mutex_heap_;
+  mip_node_heap_t<mip_node_t<i_t, f_t>*> heap_;
+
+  // Count the number of subtrees that are currently being explored.
+  omp_atomic_t<i_t> active_subtrees_;
+
+  // Queue for storing the promising node for performing dives.
+  omp_mutex_t mutex_dive_queue_;
+  dive_queue_t<i_t, f_t> dive_queue_;
+  i_t min_diving_queue_size_;
+
+  // Global status of the solver.
+  omp_atomic_t<mip_exploration_status_t> status_;
+
+  // In case, a best-first thread encounters a numerical issue when solving a node,
+  // its blocks the progression of the lower bound.
+  omp_atomic_t<f_t> lower_bound_ceiling_;
+
+  // Set the final solution.
+  mip_status_t set_final_solution(mip_solution_t<i_t, f_t>& solution, f_t lower_bound);
 
   // Update the incumbent solution with the new feasible solution.
   // found during branch and bound.
   void add_feasible_solution(f_t leaf_objective,
                              const std::vector<f_t>& leaf_solution,
                              i_t leaf_depth,
-                             char symbol);
+                             char thread_type);
 
   // Repairs low-quality solutions from the heuristics, if it is applicable.
   void repair_heuristic_solutions();
 
-  // Explore the search tree using the best-first search strategy.
-  mip_status_t explore_tree(i_t branch_var, mip_solution_t<i_t, f_t>& solution);
-
-  // Explore the search tree using the depth-first search strategy.
-  mip_status_t dive(i_t branch_var, mip_solution_t<i_t, f_t>& solution);
-
-  // Branch the current node, creating two children.
-  void branch(mip_node_t<i_t, f_t>* parent_node,
-              i_t branch_var,
-              f_t branch_var_val,
-              const std::vector<variable_status_t>& parent_vstatus);
-
-  // Solve the LP relaxation of a leaf node.
-  mip_status_t solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
-                             lp_problem_t<i_t, f_t>& leaf_problem,
-                             csc_matrix_t<i_t, f_t>& Arow,
-                             const std::vector<variable_type_t>& var_types,
-                             f_t upper_bound);
-
-  // Solve the LP relaxation of a leaf node using the dual simplex method.
-  dual::status_t node_dual_simplex(i_t leaf_id,
-                                   lp_problem_t<i_t, f_t>& leaf_problem,
-                                   std::vector<variable_status_t>& leaf_vstatus,
-                                   lp_solution_t<i_t, f_t>& leaf_solution,
-                                   std::vector<bool>& bounds_changed,
-                                   csc_matrix_t<i_t, f_t>& Arow,
-                                   f_t upper_bound,
-                                   logger_t& log);
+  // Ramp-up phase of the solver, where we greedily expand the tree until
+  // there is enough unexplored nodes. This is done recursively using OpenMP tasks.
+  void exploration_ramp_up(search_tree_t<i_t, f_t>* search_tree,
+                           mip_node_t<i_t, f_t>* node,
+                           lp_problem_t<i_t, f_t>& leaf_problem,
+                           const csc_matrix_t<i_t, f_t>& Arow,
+                           i_t initial_heap_size);
+
+  // Explore the search tree using the best-first search with plunging strategy.
+  void explore_subtree(i_t id,
+                       search_tree_t<i_t, f_t>& search_tree,
+                       mip_node_t<i_t, f_t>* start_node,
+                       lp_problem_t<i_t, f_t>& leaf_problem,
+                       const csc_matrix_t<i_t, f_t>& Arow);
+
+  // Each "main" thread pops a node from the global heap and then performs a plunge
+  // (i.e., a shallow dive) into the subtree determined by the node.
+  void best_first_thread(i_t id,
+                         search_tree_t<i_t, f_t>& search_tree,
+                         lp_problem_t<i_t, f_t>& leaf_problem,
+                         const csc_matrix_t<i_t, f_t>& Arow);
+
+  // Each diving thread pops the first node from the dive queue and then performs
+  // a deep dive into the subtree determined by the node.
+  void diving_thread(lp_problem_t<i_t, f_t>& leaf_problem, const csc_matrix_t<i_t, f_t>& Arow);
+
+  // Solve the LP relaxation of a leaf node and update the tree.
+  node_status_t solve_node(search_tree_t<i_t, f_t>& search_tree,
+                           mip_node_t<i_t, f_t>* node_ptr,
+                           lp_problem_t<i_t, f_t>& leaf_problem,
+                           const csc_matrix_t<i_t, f_t>& Arow,
+                           f_t upper_bound,
+                           logger_t& log,
+                           char thread_type);
+
+  // Sort the children based on the Martin's criteria.
+  std::pair<mip_node_t<i_t, f_t>*, mip_node_t<i_t, f_t>*> child_selection(
+    mip_node_t<i_t, f_t>* node_ptr);
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/mip_node.cpp b/cpp/src/dual_simplex/mip_node.cpp
index f0b275d0a4..2a907c4022 100644
--- a/cpp/src/dual_simplex/mip_node.cpp
+++ b/cpp/src/dual_simplex/mip_node.cpp
@@ -22,7 +22,7 @@ namespace cuopt::linear_programming::dual_simplex {
 bool inactive_status(node_status_t status)
 {
   return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE ||
-          status == node_status_t::INFEASIBLE);
+          status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL);
 }
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp
index 2d315fe0e3..a3a34eb81d 100644
--- a/cpp/src/dual_simplex/mip_node.hpp
+++ b/cpp/src/dual_simplex/mip_node.hpp
@@ -19,8 +19,8 @@
 
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/types.hpp>
+#include <utilities/omp_helpers.hpp>
 
-#include <atomic>
 #include <cmath>
 #include <list>
 #include <memory>
@@ -30,11 +30,12 @@ namespace cuopt::linear_programming::dual_simplex {
 
 enum class node_status_t : int {
   ACTIVE           = 0,  // Node still in the tree
-  IN_PROGRESS      = 1,  // Node is currently being solved
-  INTEGER_FEASIBLE = 2,  // Node has an integer feasible solution
-  INFEASIBLE       = 3,  // Node is infeasible
-  FATHOMED         = 4,  // Node objective is greater than the upper bound
-  HAS_CHILDREN     = 5,  // Node has children to explore
+  INTEGER_FEASIBLE = 1,  // Node has an integer feasible solution
+  INFEASIBLE       = 2,  // Node is infeasible
+  FATHOMED         = 3,  // Node objective is greater than the upper bound
+  HAS_CHILDREN     = 4,  // Node has children to explore
+  NUMERICAL        = 5,  // Encountered numerical issue when solving the LP relaxation
+  TIME_LIMIT       = 6   // Time out during the LP relaxation
 };
 
 bool inactive_status(node_status_t status);
@@ -201,6 +202,22 @@ class mip_node_t {
     }
   }
 
+  // This method creates a copy of the current node
+  // with its parent set to `nullptr`, `node_id = 0`
+  // and `depth = 0` such that it is the root
+  // of a separated tree.
+  mip_node_t<i_t, f_t> detach_copy() const
+  {
+    mip_node_t<i_t, f_t> copy(lower_bound, vstatus);
+    copy.branch_var       = branch_var;
+    copy.branch_dir       = branch_dir;
+    copy.branch_var_lower = branch_var_lower;
+    copy.branch_var_upper = branch_var_upper;
+    copy.fractional_val   = fractional_val;
+    copy.node_id          = node_id;
+    return copy;
+  }
+
   node_status_t status;
   f_t lower_bound;
   i_t depth;
@@ -230,11 +247,96 @@ void remove_fathomed_nodes(std::vector<mip_node_t<i_t, f_t>*>& stack)
 template <typename i_t, typename f_t>
 class node_compare_t {
  public:
-  bool operator()(mip_node_t<i_t, f_t>& a, mip_node_t<i_t, f_t>& b)
+  bool operator()(const mip_node_t<i_t, f_t>& a, const mip_node_t<i_t, f_t>& b) const
   {
     return a.lower_bound >
            b.lower_bound;  // True if a comes before b, elements that come before are output last
   }
+
+  bool operator()(const mip_node_t<i_t, f_t>* a, const mip_node_t<i_t, f_t>* b) const
+  {
+    return a->lower_bound >
+           b->lower_bound;  // True if a comes before b, elements that come before are output last
+  }
+};
+
+template <typename i_t, typename f_t>
+class search_tree_t {
+ public:
+  search_tree_t(f_t root_lower_bound, const std::vector<variable_status_t>& basis)
+    : root(root_lower_bound, basis), num_nodes(0)
+  {
+  }
+
+  search_tree_t(mip_node_t<i_t, f_t>&& node) : root(std::move(node)), num_nodes(0) {}
+
+  void update_tree(mip_node_t<i_t, f_t>* node_ptr, node_status_t status)
+  {
+    mutex.lock();
+    std::vector<mip_node_t<i_t, f_t>*> stack;
+    node_ptr->set_status(status, stack);
+    remove_fathomed_nodes(stack);
+    mutex.unlock();
+  }
+
+  void branch(mip_node_t<i_t, f_t>* parent_node,
+              const i_t branch_var,
+              const f_t fractional_val,
+              const std::vector<variable_status_t>& parent_vstatus,
+              const lp_problem_t<i_t, f_t>& original_lp,
+              logger_t& log)
+  {
+    i_t id = num_nodes.fetch_add(2);
+
+    // down child
+    auto down_child = std::make_unique<mip_node_t<i_t, f_t>>(
+      original_lp, parent_node, ++id, branch_var, 0, fractional_val, parent_vstatus);
+
+    graphviz_edge(log, parent_node, down_child.get(), branch_var, 0, std::floor(fractional_val));
+
+    // up child
+    auto up_child = std::make_unique<mip_node_t<i_t, f_t>>(
+      original_lp, parent_node, ++id, branch_var, 1, fractional_val, parent_vstatus);
+
+    graphviz_edge(log, parent_node, up_child.get(), branch_var, 1, std::ceil(fractional_val));
+
+    assert(parent_vstatus.size() == original_lp.num_cols);
+    parent_node->add_children(std::move(down_child),
+                              std::move(up_child));  // child pointers moved into the tree
+  }
+
+  void graphviz_node(logger_t& log,
+                     const mip_node_t<i_t, f_t>* node_ptr,
+                     const std::string label,
+                     const f_t val)
+  {
+    if (write_graphviz) {
+      log.printf("Node%d [label=\"%s %.16e\"]\n", node_ptr->node_id, label.c_str(), val);
+    }
+  }
+
+  void graphviz_edge(logger_t& log,
+                     const mip_node_t<i_t, f_t>* origin_ptr,
+                     const mip_node_t<i_t, f_t>* dest_ptr,
+                     const i_t branch_var,
+                     const i_t branch_dir,
+                     const f_t bound)
+  {
+    if (write_graphviz) {
+      log.printf("Node%d -> Node%d [label=\"x%d %s %e\"]\n",
+                 origin_ptr->node_id,
+                 dest_ptr->node_id,
+                 branch_var,
+                 branch_dir == 0 ? "<=" : ">=",
+                 bound);
+    }
+  }
+
+  mip_node_t<i_t, f_t> root;
+  omp_mutex_t mutex;
+  omp_atomic_t<i_t> num_nodes;
+
+  static constexpr bool write_graphviz = false;
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 370badf332..01d995ba6b 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -29,9 +29,7 @@
 #include <cassert>
 #include <cmath>
 #include <cstdio>
-#include <iterator>
-#include <limits>
-#include <list>
+#include <ctime>
 
 namespace cuopt::linear_programming::dual_simplex {
 
diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp
index 95c96f859f..4bd9590e16 100644
--- a/cpp/src/dual_simplex/pseudo_costs.cpp
+++ b/cpp/src/dual_simplex/pseudo_costs.cpp
@@ -21,15 +21,14 @@
 #include <dual_simplex/solve.hpp>
 #include <dual_simplex/tic_toc.hpp>
 
-#include <random>
+#include <omp.h>
 
 namespace cuopt::linear_programming::dual_simplex {
 
 namespace {
 
 template <typename i_t, typename f_t>
-void strong_branch_helper(i_t thread_id,
-                          i_t start,
+void strong_branch_helper(i_t start,
                           i_t end,
                           f_t start_time,
                           const lp_problem_t<i_t, f_t>& original_lp,
@@ -46,6 +45,7 @@ void strong_branch_helper(i_t thread_id,
 
   constexpr bool verbose = false;
   f_t last_log           = tic();
+  i_t thread_id          = omp_get_thread_num();
   for (i_t k = start; k < end; ++k) {
     const i_t j = fractional[k];
 
@@ -79,7 +79,6 @@ void strong_branch_helper(i_t thread_id,
                                           solution,
                                           iter,
                                           child_edge_norms);
-      const f_t lp_solve_time                = toc(lp_start_time);
 
       f_t obj = std::numeric_limits<f_t>::infinity();
       if (status == dual::status_t::DUAL_UNBOUNDED) {
@@ -126,15 +125,10 @@ void strong_branch_helper(i_t thread_id,
     }
     if (toc(start_time) > settings.time_limit) { break; }
 
-    pc.strong_branches_completed.lock();
-    pc.num_strong_branches_completed++;
-    pc.strong_branches_completed.unlock();
+    const i_t completed = pc.num_strong_branches_completed++;
 
     if (thread_id == 0 && toc(last_log) > 10) {
       last_log = tic();
-      pc.strong_branches_completed.lock();
-      const i_t completed = pc.num_strong_branches_completed;
-      pc.strong_branches_completed.unlock();
       settings.log.printf("%d of %ld strong branches completed in %.1fs\n",
                           completed,
                           fractional.size(),
@@ -148,58 +142,6 @@ void strong_branch_helper(i_t thread_id,
   }
 }
 
-template <typename i_t>
-void get_partitioning(i_t N, i_t num_threads, i_t tid, i_t& begin, i_t& end)
-{
-  int base_tasks      = N / num_threads;
-  int remaining_tasks = N % num_threads;
-
-  if (tid < remaining_tasks) {
-    begin = tid * (base_tasks + 1);
-    end   = begin + base_tasks + 1;
-  } else {
-    begin = tid * base_tasks + remaining_tasks;
-    end   = begin + base_tasks;
-  }
-}
-
-template <typename i_t, typename f_t>
-void thread_strong_branch(i_t thread_id,
-                          i_t num_threads,
-                          f_t start_time,
-                          const lp_problem_t<i_t, f_t>& original_lp,
-                          const simplex_solver_settings_t<i_t, f_t>& settings,
-                          const std::vector<variable_type_t>& var_types,
-                          const std::vector<i_t>& fractional,
-                          f_t root_obj,
-                          const std::vector<f_t>& root_soln,
-                          const std::vector<variable_status_t>& root_vstatus,
-                          const std::vector<f_t>& edge_norms,
-                          pseudo_costs_t<i_t, f_t>& pc)
-{
-  i_t start, end;
-  get_partitioning(static_cast<i_t>(fractional.size()), num_threads, thread_id, start, end);
-  constexpr bool verbose = false;
-  if (verbose) {
-    settings.log.printf(
-      "Thread id %d start %d end %d. size %d\n", thread_id, start, end, end - start);
-  }
-
-  strong_branch_helper(thread_id,
-                       start,
-                       end,
-                       start_time,
-                       original_lp,
-                       settings,
-                       var_types,
-                       fractional,
-                       root_obj,
-                       root_soln,
-                       root_vstatus,
-                       edge_norms,
-                       pc);
-}
-
 }  // namespace
 
 template <typename i_t, typename f_t>
@@ -214,51 +156,49 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       const std::vector<f_t>& edge_norms,
                       pseudo_costs_t<i_t, f_t>& pc)
 {
+  pc.resize(original_lp.num_cols);
   pc.strong_branch_down.resize(fractional.size());
   pc.strong_branch_up.resize(fractional.size());
-
   pc.num_strong_branches_completed = 0;
 
-  if (fractional.size() > settings.num_threads) {
-    int num_threads = settings.num_threads;
-    std::thread threads[num_threads];
-    settings.log.printf("Strong branching using %d threads and %ld fractional variables\n",
-                        num_threads,
-                        fractional.size());
-    for (int thread_id = 0; thread_id < num_threads; thread_id++) {
-      threads[thread_id] = std::thread(thread_strong_branch<i_t, f_t>,
-                                       thread_id,
-                                       num_threads,
-                                       start_time,
-                                       original_lp,
-                                       settings,
-                                       var_types,
-                                       fractional,
-                                       root_obj,
-                                       root_soln,
-                                       root_vstatus,
-                                       edge_norms,
-                                       std::ref(pc));
-    }
+  settings.log.printf("Strong branching using %d threads and %ld fractional variables\n",
+                      settings.num_threads,
+                      fractional.size());
+
+#pragma omp parallel num_threads(settings.num_threads)
+  {
+    i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
+
+    // Here we are creating more tasks than the number of threads
+    // such that they can be scheduled dynamically to the threads.
+#pragma omp for schedule(dynamic, 1)
+    for (i_t k = 0; k < n; k++) {
+      i_t start = std::floor(k * fractional.size() / n);
+      i_t end   = std::floor((k + 1) * fractional.size() / n);
+
+      constexpr bool verbose = false;
+      if (verbose) {
+        settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n",
+                            omp_get_thread_num(),
+                            k,
+                            start,
+                            end,
+                            end - start);
+      }
 
-    for (int thread_id = 0; thread_id < num_threads; thread_id++) {
-      threads[thread_id].join();
+      strong_branch_helper(start,
+                           end,
+                           start_time,
+                           original_lp,
+                           settings,
+                           var_types,
+                           fractional,
+                           root_obj,
+                           root_soln,
+                           root_vstatus,
+                           edge_norms,
+                           pc);
     }
-  } else {
-    settings.log.printf("Strong branching on %ld fractional variables\n", fractional.size());
-    strong_branch_helper(0,
-                         0,
-                         static_cast<int>(fractional.size()),
-                         start_time,
-                         original_lp,
-                         settings,
-                         var_types,
-                         fractional,
-                         root_obj,
-                         root_soln,
-                         root_vstatus,
-                         edge_norms,
-                         pc);
   }
 
   pc.update_pseudo_costs_from_strong_branching(fractional, root_soln);
@@ -268,6 +208,7 @@ template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_ptr,
                                                    f_t leaf_objective)
 {
+  mutex.lock();
   const f_t change_in_obj = leaf_objective - node_ptr->lower_bound;
   const f_t frac          = node_ptr->branch_dir == 0
                               ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val)
@@ -279,6 +220,7 @@ void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_pt
     pseudo_cost_sum_up[node_ptr->branch_var] += change_in_obj / frac;
     pseudo_cost_num_up[node_ptr->branch_var]++;
   }
+  mutex.unlock();
 }
 
 template <typename i_t, typename f_t>
@@ -317,10 +259,10 @@ void pseudo_costs_t<i_t, f_t>::initialized(i_t& num_initialized_down,
 template <typename i_t, typename f_t>
 i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractional,
                                                  const std::vector<f_t>& solution,
-                                                 const std::vector<f_t>& lower,
-                                                 const std::vector<f_t>& upper,
-                                                 logger_t& log) const
+                                                 logger_t& log)
 {
+  mutex.lock();
+
   const i_t num_fractional = fractional.size();
   std::vector<f_t> pseudo_cost_up(num_fractional);
   std::vector<f_t> pseudo_cost_down(num_fractional);
@@ -373,6 +315,8 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
   log.printf(
     "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], score[select]);
 
+  mutex.unlock();
+
   return branch_var;
 }
 
diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp
index d26fe8d489..5bd03e3fcd 100644
--- a/cpp/src/dual_simplex/pseudo_costs.hpp
+++ b/cpp/src/dual_simplex/pseudo_costs.hpp
@@ -21,8 +21,9 @@
 #include <dual_simplex/mip_node.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
 #include <dual_simplex/types.hpp>
+#include <utilities/omp_helpers.hpp>
 
-#include <mutex>
+#include <omp.h>
 
 namespace cuopt::linear_programming::dual_simplex {
 
@@ -54,9 +55,7 @@ class pseudo_costs_t {
 
   i_t variable_selection(const std::vector<i_t>& fractional,
                          const std::vector<f_t>& solution,
-                         const std::vector<f_t>& lower,
-                         const std::vector<f_t>& upper,
-                         logger_t& log) const;
+                         logger_t& log);
 
   void update_pseudo_costs_from_strong_branching(const std::vector<i_t>& fractional,
                                                  const std::vector<f_t>& root_soln);
@@ -67,8 +66,8 @@ class pseudo_costs_t {
   std::vector<f_t> strong_branch_down;
   std::vector<f_t> strong_branch_up;
 
-  std::mutex strong_branches_completed;
-  i_t num_strong_branches_completed = 0;
+  omp_mutex_t mutex;
+  omp_atomic_t<i_t> num_strong_branches_completed = 0;
 };
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index 5b7e8bf0fa..9ff96ebbd0 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -20,11 +20,11 @@
 #include <dual_simplex/logger.hpp>
 #include <dual_simplex/types.hpp>
 
+#include <omp.h>
 #include <algorithm>
 #include <atomic>
 #include <functional>
 #include <limits>
-#include <thread>
 
 namespace cuopt::linear_programming::dual_simplex {
 
@@ -59,9 +59,9 @@ struct simplex_solver_settings_t {
       refactor_frequency(100),
       iteration_log_frequency(1000),
       first_iteration_log(2),
-      num_threads(std::thread::hardware_concurrency() > 8
-                    ? (std::thread::hardware_concurrency() / 8)
-                    : std::thread::hardware_concurrency()),
+      num_threads(omp_get_max_threads() - 1),
+      num_bfs_threads(std::min(num_threads / 4, 1)),
+      num_diving_threads(std::min(num_threads - num_bfs_threads, 1)),
       random_seed(0),
       inside_mip(0),
       solution_callback(nullptr),
@@ -97,15 +97,17 @@ struct simplex_solver_settings_t {
   bool use_bound_flip_ratio;       // true if using the bound flip ratio test
   bool scale_columns;              // true to scale the columns of A
   bool relaxation;                 // true to only solve the LP relaxation of a MIP
-  bool
-    use_left_looking_lu;  // true to use left looking LU factorization, false to use right looking
-  bool eliminate_singletons;    // true to eliminate singletons from the basis
-  bool print_presolve_stats;    // true to print presolve stats
-  i_t refactor_frequency;       // number of basis updates before refactorization
-  i_t iteration_log_frequency;  // number of iterations between log updates
-  i_t first_iteration_log;      // number of iterations to log at beginning of solve
-  i_t num_threads;              // number of threads to use
-  i_t random_seed;              // random seed
+  bool use_left_looking_lu;        // true to use left looking LU factorization,
+                                   // false to use right looking
+  bool eliminate_singletons;       // true to eliminate singletons from the basis
+  bool print_presolve_stats;       // true to print presolve stats
+  i_t refactor_frequency;          // number of basis updates before refactorization
+  i_t iteration_log_frequency;     // number of iterations between log updates
+  i_t first_iteration_log;         // number of iterations to log at beginning of solve
+  i_t num_threads;                 // number of threads to use
+  i_t num_bfs_threads;             // number of threads dedicated to the best-first search
+  i_t num_diving_threads;          // number of threads dedicated to diving
+  i_t random_seed;                 // random seed
   i_t inside_mip;  // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node
   std::function<void(std::vector<f_t>&, f_t)> solution_callback;
   std::function<void()> heuristic_preemption_callback;
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 14986df664..a8da9c3255 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -91,7 +91,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_ITERATION_LIMIT, &pdlp_settings.iteration_limit, 0, std::numeric_limits<i_t>::max(), std::numeric_limits<i_t>::max()},
     {CUOPT_PDLP_SOLVER_MODE, reinterpret_cast<int*>(&pdlp_settings.pdlp_solver_mode), CUOPT_PDLP_SOLVER_MODE_STABLE1, CUOPT_PDLP_SOLVER_MODE_STABLE3, CUOPT_PDLP_SOLVER_MODE_STABLE3},
     {CUOPT_METHOD, reinterpret_cast<int*>(&pdlp_settings.method), CUOPT_METHOD_CONCURRENT, CUOPT_METHOD_DUAL_SIMPLEX, CUOPT_METHOD_CONCURRENT},
-    {CUOPT_NUM_CPU_THREADS, &mip_settings.num_cpu_threads, -1, std::numeric_limits<i_t>::max(), -1}
+    {CUOPT_NUM_CPU_THREADS, &mip_settings.num_cpu_threads, -1, std::numeric_limits<i_t>::max(), -1},
   };
 
     // Bool parameters
diff --git a/cpp/src/mip/diversity/diversity_manager.cu b/cpp/src/mip/diversity/diversity_manager.cu
index 9df0966554..a9cd335a90 100644
--- a/cpp/src/mip/diversity/diversity_manager.cu
+++ b/cpp/src/mip/diversity/diversity_manager.cu
@@ -312,9 +312,9 @@ void diversity_manager_t<i_t, f_t>::run_fj_alone(solution_t<i_t, f_t>& solution)
 template <typename i_t, typename f_t>
 void diversity_manager_t<i_t, f_t>::run_fp_alone(solution_t<i_t, f_t>& solution)
 {
-  CUOPT_LOG_INFO("Running FP alone!");
+  CUOPT_LOG_DEBUG("Running FP alone!");
   ls.run_fp(solution, timer, &population);
-  CUOPT_LOG_INFO("FP alone finished!");
+  CUOPT_LOG_DEBUG("FP alone finished!");
 }
 
 template <typename i_t, typename f_t>
@@ -543,7 +543,7 @@ void diversity_manager_t<i_t, f_t>::recombine_and_ls_with_all(
 {
   raft::common::nvtx::range fun_scope("recombine_and_ls_with_all");
   if (solutions.size() > 0) {
-    CUOPT_LOG_INFO("Running recombiners on B&B solutions with size %lu", solutions.size());
+    CUOPT_LOG_DEBUG("Running recombiners on B&B solutions with size %lu", solutions.size());
     // add all solutions because time limit might have been consumed and we might have exited before
     for (auto& sol : solutions) {
       cuopt_func_call(sol.test_feasibility(true));
diff --git a/cpp/src/mip/diversity/recombiners/sub_mip.cuh b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
index 1818092fc9..fa5968c6f5 100644
--- a/cpp/src/mip/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
@@ -111,8 +111,11 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       branch_and_bound_settings.print_presolve_stats = false;
       branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
       branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
-      branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
-      branch_and_bound_settings.solution_callback = [this](std::vector<f_t>& solution,
+      branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
+      branch_and_bound_settings.num_threads     = 2;
+      branch_and_bound_settings.num_bfs_threads = 1;
+      branch_and_bound_settings.num_diving_threads = 1;
+      branch_and_bound_settings.solution_callback  = [this](std::vector<f_t>& solution,
                                                            f_t objective) {
         this->solution_callback(solution, objective);
       };
diff --git a/cpp/src/mip/local_search/rounding/lb_constraint_prop.cu b/cpp/src/mip/local_search/rounding/lb_constraint_prop.cu
index b9408755c8..be74611063 100644
--- a/cpp/src/mip/local_search/rounding/lb_constraint_prop.cu
+++ b/cpp/src/mip/local_search/rounding/lb_constraint_prop.cu
@@ -770,7 +770,6 @@ bool lb_constraint_prop_t<i_t, f_t>::find_integer(
 {
   RAFT_CHECK_CUDA(problem.handle_ptr->get_stream());
   if (orig_sol.problem_ptr->n_integer_vars == 0) {
-    CUOPT_LOG_ERROR("No integer variables provided in the bounds prop rounding");
     cuopt_func_call(orig_sol.test_variable_bounds());
     return orig_sol.compute_feasibility();
   }
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index a9b7615ed6..7c89e849d8 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -172,11 +172,19 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
     branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
 
-    if (context.settings.num_cpu_threads != -1) {
+    if (context.settings.num_cpu_threads < 0) {
+      branch_and_bound_settings.num_threads = omp_get_max_threads() - 1;
+    } else {
       branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
     }
     CUOPT_LOG_INFO("Using %d CPU threads for B&B", branch_and_bound_settings.num_threads);
 
+    i_t num_threads                              = branch_and_bound_settings.num_threads;
+    i_t num_bfs_threads                          = std::max(1, num_threads / 4);
+    i_t num_diving_threads                       = std::max(1, num_threads - num_bfs_threads);
+    branch_and_bound_settings.num_bfs_threads    = num_bfs_threads;
+    branch_and_bound_settings.num_diving_threads = num_diving_threads;
+
     // Set the branch and bound -> primal heuristics callback
     branch_and_bound_settings.solution_callback =
       std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::solution_callback,
diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp
new file mode 100644
index 0000000000..d5b91c6f91
--- /dev/null
+++ b/cpp/src/utilities/omp_helpers.hpp
@@ -0,0 +1,138 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef _OPENMP
+
+#include <omp.h>
+#include <type_traits>
+
+namespace cuopt {
+
+// Wrapper of omp_lock_t. Optionally, you can provide a hint as defined in
+// https://www.openmp.org/spec-html/5.1/openmpse39.html#x224-2570003.9
+class omp_mutex_t {
+ public:
+  omp_mutex_t() { omp_init_lock(&mutex); }
+  virtual ~omp_mutex_t() { omp_destroy_lock(&mutex); }
+  void lock() { omp_set_lock(&mutex); }
+  void unlock() { omp_unset_lock(&mutex); }
+  bool try_lock() { return omp_test_lock(&mutex); }
+
+ private:
+  omp_lock_t mutex;
+};
+
+// Wrapper for omp atomic operations. See
+// https://www.openmp.org/spec-html/5.1/openmpsu105.html.
+template <typename T>
+class omp_atomic_t {
+ public:
+  omp_atomic_t() = default;
+  omp_atomic_t(T val) : val(val) {}
+
+  T operator=(T new_val)
+  {
+    store(new_val);
+    return new_val;
+  }
+
+  operator T() { return load(); }
+  T operator+=(T inc) { return fetch_add(inc) + inc; }
+  T operator-=(T inc) { return fetch_sub(inc) - inc; }
+
+  // In theory, this should be enabled only for integers,
+  // but it works for any numerical types.
+  T operator++() { return fetch_add(T(1)) + 1; }
+  T operator++(int) { return fetch_add(T(1)); }
+  T operator--() { return fetch_sub(T(1)) - 1; }
+  T operator--(int) { return fetch_sub(T(1)); }
+
+  T load()
+  {
+    T res;
+#pragma omp atomic read
+    res = val;
+    return res;
+  }
+
+  void store(T new_val)
+  {
+#pragma omp atomic write
+    val = new_val;
+  }
+
+  T exchange(T other)
+  {
+    T old;
+#pragma omp atomic capture
+    {
+      old = val;
+      val = other;
+    }
+    return old;
+  }
+
+  T fetch_add(T inc)
+  {
+    T old;
+#pragma omp atomic capture
+    {
+      old = val;
+      val += inc;
+    }
+    return old;
+  }
+
+  T fetch_sub(T inc) { return fetch_add(-inc); }
+
+// Atomic CAS are only supported in OpenMP v5.1
+// (gcc 12+ or clang 14+), however, nvcc (or the host compiler) cannot
+// parse it correctly yet
+#ifndef __NVCC__
+
+  T fetch_min(T other)
+  {
+    T old;
+#pragma omp atomic compare capture
+    {
+      old = val;
+      val = other < val ? other : val;
+    }
+    return old;
+  }
+
+  T fetch_max(T other)
+  {
+    T old;
+#pragma omp atomic compare capture
+    {
+      old = val;
+      val = other > val ? other : val;
+    }
+    return old;
+  }
+#endif
+
+ private:
+  T val;
+};
+
+#endif
+
+}  // namespace cuopt