NVIDIA · srib · Apr 7, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
@@ -1094,6 +1094,7 @@ class iteration_data_t {
     std::sort(column_nz_permutation.begin(),
               column_nz_permutation.end(),
               [&column_nz](i_t i, i_t j) { return column_nz[i] < column_nz[j]; });
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
 
     // We then compute the exact sparsity pattern for columns of A whose
     // number of nonzeros is less than a threshold. This part can be done
@@ -1124,6 +1125,7 @@ class iteration_data_t {
     // The best way to do that is to have A stored in CSR format.
     csr_matrix_t<i_t, f_t> A_row(0, 0, 0);
     A.to_compressed_row(A_row);
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
 
     std::vector<i_t> histogram(m + 1, 0);
     for (i_t j = 0; j < n; j++) {
@@ -1253,6 +1255,7 @@ class iteration_data_t {
     std::sort(permutation.begin(), permutation.end(), [&delta_nz](i_t i, i_t j) {
       return delta_nz[i] < delta_nz[j];
     });
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
 
     // Now we make a forward pass and compute the number of nonzeros in C
     // assuming we had included column j
@@ -2297,6 +2300,12 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
     if (use_augmented) {
       RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
       data.form_augmented();
+      // Check halt after form_augmented (synchronous) and before factorize (~1s).
+      // If halt was set while form_augmented ran, we catch it here and skip the
+      // expensive factorization entirely.
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        return CONCURRENT_HALT_RETURN;
+      }
       status = data.chol->factorize(data.device_augmented);
 
 #ifdef CHOLESKY_DEBUG_CHECK
@@ -2305,6 +2314,12 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
     } else {
       // compute ADAT = A Dinv * A^T
       data.form_adat();
+      // Check halt after form_adat (synchronous) and before factorize (~1s).
+      // If halt was set while form_adat ran, we catch it here and skip the
+      // expensive Cholesky factorization entirely.
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        return CONCURRENT_HALT_RETURN;
+      }
       // factorize
       status = data.chol->factorize(data.device_ADAT);
     }

@@ -2431,7 +2431,22 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
   assert(q.size() == A.m);
   reorder_basic_list(q, basic_list);  // We no longer need q after reordering the basic list
   work_estimate_ += 3 * q.size();
-  reset();
+
+  // Check halt before the transpose operations: these can take hundreds of ms
+  // on large problems (L0 and U0 each have O(fill-in) nonzeros) and have no
+  // internal halt checks.  Catching the flag here avoids the dead zone.
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return CONCURRENT_HALT_RETURN;
+  }
+  // Inline reset() so we can check halt between the two transposes.
+  clear();
+  L0_.transpose(L0_transpose_);
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return CONCURRENT_HALT_RETURN;
+  }
+  U0_.transpose(U0_transpose_);
+  work_estimate_ += 6 * L0_.col_start[L0_.n] + 6 * U0_.col_start[U0_.n];
+  reset_stats();
   return 0;
 }
 

@@ -2488,7 +2488,6 @@ dual::status_t dual_phase2(i_t phase,
   const i_t n = lp.num_cols;
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
-  std::vector<i_t> superbasic_list;
   basis_update_mpf_t<i_t, f_t> ft(m, settings.refactor_frequency);
   const bool initialize_basis = true;
   return dual_phase2_with_advanced_basis(phase,
@@ -2688,6 +2687,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                         vector_norm2<i_t, f_t>(delta_y_steepest_edge));
   }
 
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return dual::status_t::CONCURRENT_LIMIT;
+  }
+
   if (phase == 2) {
     settings.log.printf(" Iter     Objective           Num Inf.  Sum Inf.     Perturb  Time\n");
   }
@@ -2735,10 +2738,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0);
 #endif
 
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return dual::status_t::CONCURRENT_LIMIT;
+  }
+
   csc_matrix_t<i_t, f_t> A_transpose(1, 1, 0);
   lp.A.transpose(A_transpose);
   phase2_work_estimate += 2 * lp.A.col_start[lp.A.n];
 
+  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    return dual::status_t::CONCURRENT_LIMIT;
+  }
+
   f_t obj = compute_objective(lp, x);
   phase2_work_estimate += 2 * n;
 
@@ -2908,6 +2919,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse);
     }
     timers.btran_time += timers.stop_timer();
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
 
     const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared();
     phase2_work_estimate += 2 * delta_y_sparse.i.size();
@@ -2966,6 +2980,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       }
     }
     timers.delta_z_time += timers.stop_timer();
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
 
 #ifdef COMPUTE_DUAL_RESIDUAL
     std::vector<f_t> dual_residual;
@@ -3301,6 +3318,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     }
 
     timers.ftran_time += timers.stop_timer();
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
 
 #ifdef CHECK_PRIMAL_STEP
     std::vector<f_t> residual(m);
@@ -3331,6 +3351,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 #endif
     assert(steepest_edge_status == 0);
     timers.se_norms_time += timers.stop_timer();
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
 
     timers.start_timer();
     // x <- x + delta_x

@@ -120,16 +120,17 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t<i_t, f_t>& original
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
   basis_update_mpf_t<i_t, f_t> ft(m, settings.refactor_frequency);
-  return solve_linear_program_with_advanced_basis(original_lp,
-                                                  start_time,
-                                                  settings,
-                                                  original_solution,
-                                                  ft,
-                                                  basic_list,
-                                                  nonbasic_list,
-                                                  vstatus,
-                                                  edge_norms,
-                                                  work_unit_context);
+  lp_status_t result = solve_linear_program_with_advanced_basis(original_lp,
+                                                                start_time,
+                                                                settings,
+                                                                original_solution,
+                                                                ft,
+                                                                basic_list,
+                                                                nonbasic_list,
+                                                                vstatus,
+                                                                edge_norms,
+                                                                work_unit_context);
+  return result;
 }
 
 template <typename i_t, typename f_t>
@@ -222,7 +223,10 @@ lp_status_t solve_linear_program_with_advanced_basis(
   if (phase1_status == dual::status_t::TIME_LIMIT) { return lp_status_t::TIME_LIMIT; }
   if (phase1_status == dual::status_t::WORK_LIMIT) { return lp_status_t::WORK_LIMIT; }
   if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; }
-  if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; }
+  if (phase1_status == dual::status_t::CONCURRENT_LIMIT) {
+    original_solution.iterations = iter;
+    return lp_status_t::CONCURRENT_LIMIT;
+  }
   phase1_obj = phase1_solution.objective;
   if (phase1_obj > -settings.primal_tol) {
     settings.log.printf("Dual feasible solution found.\n");
@@ -309,7 +313,10 @@ lp_status_t solve_linear_program_with_advanced_basis(
     if (status == dual::status_t::TIME_LIMIT) { lp_status = lp_status_t::TIME_LIMIT; }
     if (status == dual::status_t::WORK_LIMIT) { lp_status = lp_status_t::WORK_LIMIT; }
     if (status == dual::status_t::ITERATION_LIMIT) { lp_status = lp_status_t::ITERATION_LIMIT; }
-    if (status == dual::status_t::CONCURRENT_LIMIT) { lp_status = lp_status_t::CONCURRENT_LIMIT; }
+    if (status == dual::status_t::CONCURRENT_LIMIT) {
+      original_solution.iterations = iter;
+      return lp_status_t::CONCURRENT_LIMIT;
+    }
     if (status == dual::status_t::NUMERICAL) { lp_status = lp_status_t::NUMERICAL_ISSUES; }
     if (status == dual::status_t::CUTOFF) { lp_status = lp_status_t::CUTOFF; }
     original_solution.iterations = iter;
@@ -581,6 +588,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t<i_t, f_t>& us
     solution.iterations         = barrier_solution.iterations;
   }
 
+  if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; }
+
   // If we aren't doing crossover, we're done
   if (!settings.crossover || barrier_lp.Q.n > 0) { return barrier_status; }
 
@@ -681,6 +690,10 @@ lp_status_t solve_linear_program(const user_problem_t<i_t, f_t>& user_problem,
   std::vector<f_t> edge_norms;
   lp_status_t status = solve_linear_program_advanced(
     original_lp, start_time, settings, lp_solution, vstatus, edge_norms);
+  if (status == lp_status_t::CONCURRENT_LIMIT) {
+    solution.iterations = lp_solution.iterations;
+    return lp_status_t::CONCURRENT_LIMIT;
+  }
   uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x);
   uncrush_dual_solution(
     user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z);