diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu
index 386e654488..ad20034f46 100644
--- a/cpp/src/barrier/barrier.cu
+++ b/cpp/src/barrier/barrier.cu
@@ -1390,7 +1390,7 @@ class iteration_data_t {
     // v = alpha * A * w + beta * v = alpha * A * Dinv * A^T * y + beta * v
     matrix_vector_multiply(A, alpha, w, beta, v);
     if (debug) {
-      printf("||A|| = %.16e\n", vector_norm2<i_t, f_t>(A.x.underlying()));
+      printf("||A|| = %.16e\n", vector_norm2<i_t, f_t>(A.x));
       printf("||w|| = %.16e\n", vector_norm2<i_t, f_t>(w));
       printf("||v|| = %.16e\n", vector_norm2<i_t, f_t>(v));
     }
diff --git a/cpp/src/barrier/cusparse_view.cu b/cpp/src/barrier/cusparse_view.cu
index 05488e77e2..11d036504f 100644
--- a/cpp/src/barrier/cusparse_view.cu
+++ b/cpp/src/barrier/cusparse_view.cu
@@ -159,9 +159,9 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(raft::handle_t const* handle_ptr,
   A_indices_ = device_copy(indices, handle_ptr->get_stream());
   A_data_    = device_copy(data, handle_ptr->get_stream());
 
-  A_T_offsets_ = device_copy(A.col_start.underlying(), handle_ptr->get_stream());
-  A_T_indices_ = device_copy(A.i.underlying(), handle_ptr->get_stream());
-  A_T_data_    = device_copy(A.x.underlying(), handle_ptr->get_stream());
+  A_T_offsets_ = device_copy(A.col_start, handle_ptr->get_stream());
+  A_T_indices_ = device_copy(A.i, handle_ptr->get_stream());
+  A_T_data_    = device_copy(A.x, handle_ptr->get_stream());
 
   cusparseCreateCsr(&A_,
                     rows,
diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp
index 17f997f4a9..777687d49b 100644
--- a/cpp/src/dual_simplex/basis_solves.cpp
+++ b/cpp/src/dual_simplex/basis_solves.cpp
@@ -17,6 +17,7 @@
 
 namespace cuopt::linear_programming::dual_simplex {
 
+// Work = 3 * m
 template <typename i_t>
 i_t reorder_basic_list(const std::vector<i_t>& q, std::vector<i_t>& basic_list)
 {
@@ -165,7 +166,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
                     std::vector<i_t>& pinv,
                     std::vector<i_t>& q,
                     std::vector<i_t>& deficient,
-                    std::vector<i_t>& slacks_needed)
+                    std::vector<i_t>& slacks_needed,
+                    f_t& work_estimate)
 {
   raft::common::nvtx::range scope("LU::factorize_basis");
   const i_t m              = basic_list.size();
@@ -177,14 +179,18 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
     // TODO: We should see if we can find the singletons without explictly forming the matrix B
     f_t fact_start = tic();
     csc_matrix_t<i_t, f_t> B(A.m, A.m, 1);
-    form_b(A, basic_list, B);
+    work_estimate += A.m;
+    form_b(A, basic_list, B, work_estimate);
     std::vector<i_t> row_perm(m);
     std::vector<i_t> col_perm(m);
+    work_estimate += 2 * m;
     i_t row_singletons;
     i_t col_singletons;
-    find_singletons(B, row_singletons, row_perm, col_singletons, col_perm);
+    find_singletons(B, row_singletons, row_perm, col_singletons, col_perm, work_estimate);
     std::vector<i_t> row_perm_inv(m);
+    work_estimate += m;
     inverse_permutation(row_perm, row_perm_inv);
+    work_estimate += 3 * m;
 
 #ifdef PRINT_SINGLETONS
     printf("Singletons row %d col %d num %d\n",
@@ -219,6 +225,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
       const i_t Bnz = B.col_start[m];
       L.reallocate(Bnz);
       U.reallocate(Bnz);
+      work_estimate += 2 * Bnz;
       i_t Lnz = 0;
       // Fill in L(:, 0:col_singletons-1) with I
       for (i_t k = 0; k < col_singletons; ++k) {
@@ -228,6 +235,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
         Lnz++;
         assert(Lnz <= Bnz);
       }
+      work_estimate += 3 * col_singletons;
 
       i_t Unz = 0;
       // Fill in U(:, 0:col_singletons-1) with U_11
@@ -242,8 +250,10 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
           Unz++;
           assert(Unz <= Bnz);
         }
+        work_estimate += 4 * (col_end - col_start);
       }
       if (col_singletons > 0) { U.col_start[col_singletons] = Unz; }
+      work_estimate += 3 * col_singletons;
 
       // Ensure U(i, i) is at the end of column i for U_11
       for (i_t k = 0; k < col_singletons; ++k) {
@@ -259,7 +269,9 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
             break;
           }
         }
+        work_estimate += (col_before_end - col_start);
       }
+      work_estimate += 2 * col_singletons;
 
       // Fill in L(:, col_singletons:col_singletons+row_singletons-1) with L_22 and L_32
       //     and U(:, col_singletons:col_singletons+row_singletons-1) with U_12 and I
@@ -284,6 +296,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
             assert(Unz <= Bnz);
           }
         }
+        work_estimate += 5 * (col_end - col_start);
         // add in the identity in U
         U.i[Unz] = k;
         U.x[Unz] = 1.0;
@@ -291,6 +304,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
         assert(Unz <= Bnz);
       }
       L.col_start[num_singletons] = Lnz;
+      work_estimate += 7 * (num_singletons - col_singletons);
 
       // Ensure L(i, i) is at the beginning of column i for L_22 and L32
       for (i_t k = col_singletons; k < num_singletons; ++k) {
@@ -310,8 +324,10 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
             break;
           }
         }
+        work_estimate += (col_end - col_start);
         assert(found_diag);
       }
+      work_estimate += 4 * (num_singletons - col_singletons);
 
       // Compute how many nonzeros in B we have used so far so we know
       // how many nonzeros are in S
@@ -322,7 +338,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
       f_t actual_factor  = 0;
       if (Sdim > 0) {
         csc_matrix_t<i_t, f_t> S(Sdim, Sdim, Snz_max);
-
+        work_estimate += Sdim + Snz_max;
         // Build S
         i_t Snz = 0;
         for (i_t k = num_singletons; k < m; ++k) {
@@ -341,13 +357,18 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
               assert(Snz <= Snz_max);
             }
           }
+          work_estimate += 3 * (col_end - col_start);
         }
         S.col_start[Sdim] = Snz;  // Finalize S
+        work_estimate += 4 * (m - num_singletons);
 
         csc_matrix_t<i_t, f_t> SL(Sdim, Sdim, Snz);
         csc_matrix_t<i_t, f_t> SU(Sdim, Sdim, Snz);
+        work_estimate += 2 * Sdim + 2 * Snz;
+
         // Factorize S
         std::vector<i_t> S_perm_inv(Sdim);
+        work_estimate += Sdim;
         std::optional<std::vector<i_t>> empty = std::nullopt;
         f_t actual_factor_start               = tic();
 
@@ -356,6 +377,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
         for (i_t h = 0; h < Sdim; ++h) {
           identity[h] = h;
         }
+        work_estimate += 3 * Sdim;
+
         Srank = right_looking_lu(S,
                                  settings,
                                  settings.threshold_partial_pivoting_tol,
@@ -363,7 +386,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
                                  S_col_perm,
                                  SL,
                                  SU,
-                                 S_perm_inv);
+                                 S_perm_inv,
+                                 work_estimate);
         if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
           settings.log.printf("Concurrent halt\n");
           return CONCURRENT_HALT_RETURN;
@@ -375,33 +399,40 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
           for (i_t h = Srank; h < Sdim; ++h) {
             deficient[h - Srank] = col_perm[num_singletons + S_col_perm[h]];
           }
+          work_estimate += 3 * (Sdim - Srank);
           // Get S_perm
           std::vector<i_t> S_perm(Sdim);
           inverse_permutation(S_perm_inv, S_perm);
+          work_estimate += 4 * Sdim;
           // Get the slacks needed
           slacks_needed.resize(Sdim - Srank);
           for (i_t h = Srank; h < Sdim; ++h) {
             slacks_needed[h - Srank] = row_perm[num_singletons + S_perm[h]];
           }
+          work_estimate += 3 * (Sdim - Srank);
 
           return -1;
         }
 
         // Need to permute col_perm[k] according to q
         std::vector<i_t> col_perm_sav(m - num_singletons);
+        work_estimate += (m - num_singletons);
         i_t q_j = 0;
         for (i_t h = num_singletons; h < m; ++h) {
           col_perm_sav[q_j] = col_perm[h];
           q_j++;
         }
+        work_estimate += 2 * (m - num_singletons);
         q_j = 0;
         for (i_t h = num_singletons; h < m; ++h) {
           col_perm[h] = col_perm_sav[S_col_perm[q_j]];
           q_j++;
         }
+        work_estimate += 2 * (m - num_singletons);
 
         std::vector<i_t> S_perm(m);
         inverse_permutation(S_perm_inv, S_perm);
+        work_estimate += 4 * m;
         actual_factor = toc(actual_factor_start);
 
         // Permute the rows of L_32 according to S_perm_inv
@@ -415,11 +446,16 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
               L.i[p]          = new_i;
             }
           }
+          work_estimate += (col_end - col_start);
         }
+        work_estimate += 3 * (num_singletons - col_singletons);
 
         const i_t SLnz    = SL.col_start[Sdim];
         const i_t Lnz_max = Lnz + SLnz;
-        if (Lnz_max > Bnz) { L.reallocate(Lnz_max); }
+        if (Lnz_max > Bnz) {
+          L.reallocate(Lnz_max);
+          work_estimate += Lnz_max;
+        }
 
         // Fill in L(:, num_singletons:m-1) with L_33
         for (i_t k = num_singletons; k < m; ++k) {
@@ -434,13 +470,19 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
             Lnz++;
             assert(Lnz <= Lnz_max);
           }
+          work_estimate += 4 * (col_end - col_start);
         }
+        work_estimate += 3 * (m - num_singletons);
+
         assert(Lnz == Lnz_max);
         L.col_start[m] = Lnz;  // Finalize L
 
         const i_t SUnz    = SU.col_start[Sdim];
         const i_t Unz_max = Unz + SUnz + (Bnz - Bnz_used);
-        if (Unz_max > Bnz) { U.reallocate(Unz_max); }
+        if (Unz_max > Bnz) {
+          U.reallocate(Unz_max);
+          work_estimate += Unz_max;
+        }
 
         // Fill in U(:, num_singletons:m-1) with U_13 and U_33
         for (i_t k = num_singletons; k < m; ++k) {
@@ -459,6 +501,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
               assert(Unz <= Unz_max);
             }
           }
+          work_estimate += 3 * (B_col_end - B_col_start);
 
           // U_33
           const i_t l           = k - num_singletons;
@@ -471,7 +514,10 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
             Unz++;
             assert(Unz <= Unz_max);
           }
+          work_estimate += 4 * (U_col_end - U_col_start);
         }
+        work_estimate += 5 * (m - num_singletons);
+
         assert(Unz <= Unz_max);
         U.col_start[m] = Unz;  // Finalize U
 
@@ -479,12 +525,15 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
         for (i_t k = 0; k < Sdim; ++k) {
           last_perm[k] = row_perm[num_singletons + k];
         }
+        work_estimate += 3 * Sdim;
 
         // Fix up row permutations
         for (i_t k = 0; k < Sdim; ++k) {
           row_perm[num_singletons + k] = last_perm[S_perm[k]];
         }
+        work_estimate += 3 * Sdim;
         inverse_permutation(row_perm, row_perm_inv);
+        work_estimate += 3 * row_perm.size();
       } else {
         L.col_start[m] = Lnz;  // Finalize L
         U.col_start[m] = Unz;  // Finalize U
@@ -563,13 +612,16 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
 
   if (write_basis) {
     csc_matrix_t<i_t, f_t> B(m, m, 1);
-    form_b(A, basic_list, B);
+    form_b(A, basic_list, B, work_estimate);
     write_basis_info(B);
   }
   q.resize(m);
+  work_estimate += m;
   f_t fact_start = tic();
-  rank           = right_looking_lu(A, settings, medium_tol, basic_list, q, L, U, pinv);
+  rank = right_looking_lu(A, settings, medium_tol, basic_list, q, L, U, pinv, work_estimate);
   inverse_permutation(pinv, p);
+  work_estimate += 3 * pinv.size();
+
   if (rank != m) {
     // Get the rank deficient columns
     deficient.clear();
@@ -577,11 +629,13 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
     for (i_t h = rank; h < m; ++h) {
       deficient[h - rank] = q[h];
     }
+    work_estimate += 3 * (m - rank);
     // Get the slacks needed
     slacks_needed.resize(m - rank);
     for (i_t h = rank; h < m; ++h) {
       slacks_needed[h - rank] = p[h];
     }
+    work_estimate += 3 * (m - rank);
   }
   if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
     settings.log.printf("Concurrent halt\n");
@@ -596,7 +650,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
     multiply(L, U, C);
 
     csc_matrix_t<i_t, f_t> B(m, m, 1);
-    form_b(A, basic_list, B);
+    form_b(A, basic_list, B, work_estimate);
     csc_matrix_t<i_t, f_t> D(m, m, 1);
     B.permute_rows_and_cols(pinv, q, D);
 
@@ -623,7 +677,8 @@ i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
                  std::vector<i_t>& basis_list,
                  std::vector<i_t>& nonbasic_list,
                  std::vector<i_t>& superbasic_list,
-                 std::vector<variable_status_t>& vstatus)
+                 std::vector<variable_status_t>& vstatus,
+                 f_t& work_estimate)
 {
   const i_t m = A.m;
   const i_t n = A.n;
@@ -632,6 +687,7 @@ i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
 
   // Create slack_map
   std::vector<i_t> slack_map(m);  // slack_map[i] = j if column j is e_i
+  work_estimate += m;
   i_t slacks_found = 0;
   for (i_t j = n - 1; j >= n - m; j--) {
     const i_t col_start = A.col_start[j];
@@ -643,23 +699,29 @@ i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
       slacks_found++;
     }
   }
+  work_estimate += 3 * m + 2 * slacks_found;
 
   assert(slacks_found == m);
 
   // Create nonbasic_map
   std::vector<i_t> nonbasic_map(
     n, -1);  // nonbasic_map[j] = p if nonbasic[p] = j, -1 if j is basic/superbasic
+  work_estimate += n;
   const i_t num_nonbasic = nonbasic_list.size();
   for (i_t k = 0; k < num_nonbasic; ++k) {
     nonbasic_map[nonbasic_list[k]] = k;
   }
+  work_estimate += 2 * num_nonbasic;
+
   // Create a superbasic_map
   std::vector<i_t> superbasic_map(
     n, -1);  // superbasic_map[j] = p if superbasic[p] = j, -1 if j is basic/nonbasic
+  work_estimate += n;
   const i_t num_superbasic = superbasic_list.size();
   for (i_t k = 0; k < num_superbasic; ++k) {
     superbasic_map[superbasic_list[k]] = k;
   }
+  work_estimate += 2 * num_superbasic;
 
   const i_t columns_to_replace = deficient.size();
   for (i_t k = 0; k < columns_to_replace; ++k) {
@@ -687,6 +749,7 @@ i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
     }
     vstatus[replace_j] = variable_status_t::BASIC;
   }
+  work_estimate += 11 * columns_to_replace;
 
   return 0;
 }
@@ -694,7 +757,8 @@ i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
 template <typename i_t, typename f_t>
 i_t form_b(const csc_matrix_t<i_t, f_t>& A,
            const std::vector<i_t>& basic_list,
-           csc_matrix_t<i_t, f_t>& B)
+           csc_matrix_t<i_t, f_t>& B,
+           f_t& work_estimate)
 {
   const i_t m = A.m;
   i_t Bnz     = 0;
@@ -704,7 +768,9 @@ i_t form_b(const csc_matrix_t<i_t, f_t>& A,
     const i_t col_end   = A.col_start[j + 1];
     Bnz += (col_end - col_start);
   }
+  work_estimate += 3 * m;
   B.reallocate(Bnz);
+  work_estimate += 2 * Bnz;
   const i_t Bnz_check = Bnz;
   Bnz                 = 0;
   for (i_t k = 0; k < m; ++k) {
@@ -718,6 +784,8 @@ i_t form_b(const csc_matrix_t<i_t, f_t>& A,
       Bnz++;
     }
   }
+  work_estimate += 4 * m;
+  work_estimate += 4 * Bnz;
   B.col_start[m] = Bnz;
   assert(Bnz_check == Bnz);
   return 0;
@@ -784,9 +852,10 @@ i_t b_transpose_solve(const csc_matrix_t<i_t, f_t>& L,
 
   raft::common::nvtx::range scope("LU::b_transpose_solve");
 
+  f_t work_estimate = 0;
   // Solve for r such that U'*r = c
   std::vector<f_t> r = rhs;
-  upper_triangular_transpose_solve(U, r);
+  upper_triangular_transpose_solve(U, r, work_estimate);
 #ifdef BASIS_DEBUG
   // err = norm(U'*r - c, inf)
   std::vector<f_t> residual = rhs;
@@ -798,7 +867,7 @@ i_t b_transpose_solve(const csc_matrix_t<i_t, f_t>& L,
 #endif
 
   // Solve for w such that L'*w = r
-  lower_triangular_transpose_solve(L, r);
+  lower_triangular_transpose_solve(L, r, work_estimate);
 #ifdef BASIS_DEBUG
   // err2 = norm(L'*w -r, inf)
   matrix_transpose_vector_multiply(L, 1.0, r, -1.0, residual2);
@@ -825,13 +894,14 @@ i_t b_solve(const csc_matrix_t<i_t, f_t>& L,
   assert(p.size() == m);
   assert(rhs.size() == m);
   assert(solution.size() == m);
+  f_t work_estimate = 0;
   // P*B = L*U
   // B*x = b
   // P*B*x = P*b = b'
   permute_vector(p, rhs, solution);
 
   // Solve for v such that L*v = b'
-  lower_triangular_solve(L, solution);
+  lower_triangular_solve(L, solution, work_estimate);
 
 #ifdef BASIS_DEBUG
   std::vector<f_t> residual1(m);
@@ -844,7 +914,7 @@ i_t b_solve(const csc_matrix_t<i_t, f_t>& L,
 #endif
 
   // Solve for x such that U*x = v
-  upper_triangular_solve(U, solution);
+  upper_triangular_solve(U, solution, work_estimate);
 #ifdef BASIS_DEBUG
   matrix_vector_multiply(U, 1.0, solution, -1.0, residual2);
   const f_t err2 = vector_norm_inf(residual2);
@@ -874,7 +944,8 @@ template int factorize_basis<int>(const csc_matrix_t<int, double>& A,
                                   std::vector<int>& pinv,
                                   std::vector<int>& q,
                                   std::vector<int>& deficient,
-                                  std::vector<int>& slacks_needed);
+                                  std::vector<int>& slacks_needed,
+                                  double& work_estimate);
 
 template int basis_repair<int, double>(const csc_matrix_t<int, double>& A,
                                        const simplex_solver_settings_t<int, double>& settings,
@@ -885,11 +956,13 @@ template int basis_repair<int, double>(const csc_matrix_t<int, double>& A,
                                        std::vector<int>& basis_list,
                                        std::vector<int>& nonbasic_list,
                                        std::vector<int>& superbasic_list,
-                                       std::vector<variable_status_t>& vstatus);
+                                       std::vector<variable_status_t>& vstatus,
+                                       double& work_estimate);
 
 template int form_b<int, double>(const csc_matrix_t<int, double>& A,
                                  const std::vector<int>& basic_list,
-                                 csc_matrix_t<int, double>& B);
+                                 csc_matrix_t<int, double>& B,
+                                 double& work_estimate);
 
 template int b_multiply<int, double>(const lp_problem_t<int, double>& lp,
                                      const std::vector<int>& basic_list,
diff --git a/cpp/src/dual_simplex/basis_solves.hpp b/cpp/src/dual_simplex/basis_solves.hpp
index 59b4725e42..fbeba565f1 100644
--- a/cpp/src/dual_simplex/basis_solves.hpp
+++ b/cpp/src/dual_simplex/basis_solves.hpp
@@ -36,7 +36,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
                     std::vector<i_t>& pinv,
                     std::vector<i_t>& q,
                     std::vector<i_t>& deficient,
-                    std::vector<i_t>& slacks_need);
+                    std::vector<i_t>& slacks_need,
+                    f_t& work_estimate);
 
 // Repair the basis by bringing in slacks
 template <typename i_t, typename f_t>
@@ -49,13 +50,15 @@ i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
                  std::vector<i_t>& basis_list,
                  std::vector<i_t>& nonbasic_list,
                  std::vector<i_t>& superbasic_list,
-                 std::vector<variable_status_t>& vstatus);
+                 std::vector<variable_status_t>& vstatus,
+                 f_t& work_estimate);
 
 // Form the basis matrix B = A(:, basic_list)
 template <typename i_t, typename f_t>
 i_t form_b(const csc_matrix_t<i_t, f_t>& A,
            const std::vector<i_t>& basic_list,
-           csc_matrix_t<i_t, f_t>& B);
+           csc_matrix_t<i_t, f_t>& B,
+           f_t& work_estimate);
 
 // y = B*x = sum_{j in basis} A(:, j) * x(k)
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp
index 71dce2e39c..a5f59ef579 100644
--- a/cpp/src/dual_simplex/basis_updates.cpp
+++ b/cpp/src/dual_simplex/basis_updates.cpp
@@ -237,7 +237,8 @@ i_t basis_update_t<i_t, f_t>::l_solve(std::vector<f_t>& rhs) const
 #endif
   // First solve
   // L0*x0 = b
-  dual_simplex::lower_triangular_solve(L0_, rhs);
+  f_t work_estimate = 0;
+  dual_simplex::lower_triangular_solve(L0_, rhs, work_estimate);
 #ifdef CHECK_LOWER_SOLVE
   {
     matrix_vector_multiply(L0_, 1.0, rhs, -1.0, residual);
@@ -275,8 +276,9 @@ i_t basis_update_t<i_t, f_t>::l_solve(sparse_vector_t<i_t, f_t>& rhs) const
   // L0*x0 = b
   const i_t m = L0_.m;
 
-  i_t top = sparse_triangle_solve<i_t, f_t, true>(
-    rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data());
+  f_t work_estimate = 0;
+  i_t top           = sparse_triangle_solve<i_t, f_t, true>(
+    rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data(), work_estimate);
   solve_to_sparse_vector(top, rhs);  // Uses xi_workspace_ and x_workspace_ to fill rhs
 
 #ifdef CHECK_L_SOLVE
@@ -370,7 +372,8 @@ i_t basis_update_t<i_t, f_t>::l_transpose_solve(std::vector<f_t>& rhs) const
   }
   // L0'*y = c
   // TODO: handle a sparse rhs
-  dual_simplex::lower_triangular_transpose_solve(L0_, rhs);
+  f_t work_estimate = 0;
+  dual_simplex::lower_triangular_transpose_solve(L0_, rhs, work_estimate);
   return 0;
 }
 
@@ -500,8 +503,9 @@ i_t basis_update_t<i_t, f_t>::l_transpose_solve(sparse_vector_t<i_t, f_t>& rhs)
   rhs.to_dense(cprime_dense);
 #endif
 
-  i_t top = sparse_triangle_solve<i_t, f_t, false>(
-    rhs, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data());
+  f_t work_estimate = 0;
+  i_t top           = sparse_triangle_solve<i_t, f_t, false>(
+    rhs, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data(), work_estimate);
   solve_to_sparse_vector(top, rhs);  // Uses xi_workspace_ and x_workspace_ to fill rhs
 
 #ifdef CHECK_LOWER_TRANSPOSE_SOLVE
@@ -558,7 +562,8 @@ i_t basis_update_t<i_t, f_t>::u_solve(std::vector<f_t>& x) const
   std::vector<f_t> residual = bprime;
 #endif
 
-  dual_simplex::upper_triangular_solve(U_, bprime);
+  f_t work_estimate = 0;
+  dual_simplex::upper_triangular_solve(U_, bprime, work_estimate);
 
 #ifdef CHECK_UPPER_SOLVE
   matrix_vector_multiply(U_, 1.0, bprime, -1.0, residual);
@@ -582,8 +587,9 @@ i_t basis_update_t<i_t, f_t>::u_solve(sparse_vector_t<i_t, f_t>& rhs) const
   sparse_vector_t<i_t, f_t> bprime(m, 0);
   rhs.inverse_permute_vector(col_permutation_, bprime);
 
-  i_t top = sparse_triangle_solve<i_t, f_t, false>(
-    bprime, std::nullopt, xi_workspace_, U_, x_workspace_.data());
+  f_t work_estimate = 0;
+  i_t top           = sparse_triangle_solve<i_t, f_t, false>(
+    bprime, std::nullopt, xi_workspace_, U_, x_workspace_.data(), work_estimate);
   solve_to_sparse_vector(top, rhs);  // Uses xi_workspace_ and x_workspace_ to fill rhs
 
   rhs.inverse_permute_vector(inverse_col_permutation_);
@@ -604,7 +610,8 @@ i_t basis_update_t<i_t, f_t>::u_transpose_solve(std::vector<f_t>& x) const
   const i_t m = U_.m;
   std::vector<f_t> bprime(m);
   inverse_permute_vector(col_permutation_, x, bprime);
-  dual_simplex::upper_triangular_transpose_solve(U_, bprime);
+  f_t work_estimate = 0;
+  dual_simplex::upper_triangular_transpose_solve(U_, bprime, work_estimate);
   permute_vector(col_permutation_, bprime, x);
   return 0;
 }
@@ -649,8 +656,9 @@ i_t basis_update_t<i_t, f_t>::u_transpose_solve(sparse_vector_t<i_t, f_t>& rhs)
 #endif
 
   // U'*y = bprime
-  i_t top = sparse_triangle_solve<i_t, f_t, true>(
-    bprime, std::nullopt, xi_workspace_, U_transpose_, x_workspace_.data());
+  f_t work_estimate = 0;
+  i_t top           = sparse_triangle_solve<i_t, f_t, true>(
+    bprime, std::nullopt, xi_workspace_, U_transpose_, x_workspace_.data(), work_estimate);
   solve_to_sparse_vector(top, rhs);  // Uses xi_workspace_ and x_workspace_ to fill rhs
 
 #ifdef CHECK_WORKSPACE
@@ -871,8 +879,9 @@ i_t basis_update_t<i_t, f_t>::update(std::vector<f_t>& utilde, i_t leaving_index
 
   // Solve U'*w = delta*et
   std::vector<f_t> w(m);
-  w[t] = delta;
-  dual_simplex::upper_triangular_transpose_solve(U_, w);
+  w[t]              = delta;
+  f_t work_estimate = 0;
+  dual_simplex::upper_triangular_transpose_solve(U_, w, work_estimate);
 #ifdef PARANOID
   {
     // Compute the residual of the solve
@@ -1111,7 +1120,9 @@ i_t basis_update_t<i_t, f_t>::lower_triangular_multiply(const csc_matrix_t<i_t,
   return new_nz;
 }
 
+//////////////////////////////////////////////////////////////////////////////////////////
 // Start of middle product form: basis_update_mpf_t
+//////////////////////////////////////////////////////////////////////////////////////////
 
 template <typename i_t, typename f_t>
 i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts_basic)
@@ -1121,10 +1132,12 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
   // Solve for U^T W^T = C_B^T
   // We do this one row at a time of C_B
   csc_matrix_t<i_t, f_t> WT(m, cuts_basic.m, 0);
+  work_estimate_ += cuts_basic.m;
 
   i_t WT_nz = 0;
   for (i_t k = 0; k < cuts_basic.m; k++) {
     sparse_vector_t<i_t, f_t> rhs(cuts_basic, k);
+    work_estimate_ += rhs.i.size();
     u_transpose_solve(rhs);
     WT.col_start[k] = WT_nz;
     for (i_t q = 0; q < rhs.i.size(); q++) {
@@ -1132,6 +1145,7 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
       WT.x.push_back(rhs.x[q]);
       WT_nz++;
     }
+    work_estimate_ += 3 * rhs.i.size();
   }
   WT.col_start[cuts_basic.m] = WT_nz;
 
@@ -1161,6 +1175,8 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
 #endif
 
   csc_matrix_t<i_t, f_t> V(cuts_basic.m, m, 0);
+  work_estimate_ += m;
+  i_t V_nz = 0;
   if (num_updates_ > 0) {
     // W = V T_0 ... T_{num_updates_ - 1}
     // or V = W T_{num_updates_ - 1}^{-1} ... T_0^{-1}
@@ -1173,11 +1189,12 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
     // for appending to L0
 
     csr_matrix_t<i_t, f_t> V_row(cuts_basic.m, m, 0);
-    i_t V_nz           = 0;
+    work_estimate_ += m;
     const f_t zero_tol = 1e-13;
     for (i_t h = 0; h < cuts_basic.m; h++) {
       sparse_vector_t<i_t, f_t> rhs(WT, h);
       scatter_into_workspace(rhs);
+      work_estimate_ += 2 * rhs.i.size();
       i_t nz = rhs.i.size();
       for (i_t k = num_updates_ - 1; k >= 0; --k) {
         // T_k^{-T} = ( I - v u^T/(1 + u^T v))
@@ -1201,10 +1218,12 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
         V_row.x.push_back(rhs.x[q]);
         V_nz++;
       }
+      work_estimate_ += 2 * rhs.i.size();
     }
     V_row.row_start[cuts_basic.m] = V_nz;
 
     V_row.to_compressed_col(V);
+    work_estimate_ += 3 * V_nz;
 
 #ifdef CHECK_V
     csc_matrix_t<i_t, f_t> CB_col(cuts_basic.m, m, 0);
@@ -1245,6 +1264,7 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
   } else {
     // W = V
     WT.transpose(V);
+    work_estimate_ += 3 * WT.col_start[WT.n];
   }
 
   // Extend u_i, v_i for i = 0, ..., num_updates_ - 1
@@ -1254,9 +1274,10 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
   // L = [ L0  0 ]
   //     [ V   I ]
 
-  i_t V_nz = V.col_start[m];
+  V_nz     = V.col_start[m];
   i_t L_nz = L0_.col_start[m];
   csc_matrix_t<i_t, f_t> new_L(m + cuts_basic.m, m + cuts_basic.m, L_nz + V_nz + cuts_basic.m);
+  work_estimate_ += (L_nz + V_nz + cuts_basic.m) + (m + cuts_basic.m);
   i_t predicted_nz = L_nz + V_nz + cuts_basic.m;
   L_nz             = 0;
   for (i_t j = 0; j < m; ++j) {
@@ -1276,16 +1297,19 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
       L_nz++;
     }
   }
+  work_estimate_ += 4 * L_nz;
   for (i_t j = m; j < m + cuts_basic.m; ++j) {
     new_L.col_start[j] = L_nz;
     new_L.i[L_nz]      = j;
     new_L.x[L_nz]      = 1.0;
     L_nz++;
   }
+  work_estimate_ += 3 * cuts_basic.m;
   new_L.col_start[m + cuts_basic.m] = L_nz;
   assert(L_nz == predicted_nz);
 
   L0_ = new_L;
+  work_estimate_ += 2 * L_nz;
 
   // Adjust U
   // U = [ U0 0 ]
@@ -1295,12 +1319,14 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
   U0_.col_start.resize(m + cuts_basic.m + 1);
   U0_.i.resize(U_nz + cuts_basic.m);
   U0_.x.resize(U_nz + cuts_basic.m);
+  work_estimate_ += 2 * (U_nz + cuts_basic.m) + (m + cuts_basic.m);
   for (i_t k = m; k < m + cuts_basic.m; ++k) {
     U0_.col_start[k] = U_nz;
     U0_.i[U_nz]      = k;
     U0_.x[U_nz]      = 1.0;
     U_nz++;
   }
+  work_estimate_ += 3 * cuts_basic.m;
   U0_.col_start[m + cuts_basic.m] = U_nz;
   U0_.n                           = m + cuts_basic.m;
   U0_.m                           = m + cuts_basic.m;
@@ -1310,14 +1336,17 @@ i_t basis_update_mpf_t<i_t, f_t>::append_cuts(const csr_matrix_t<i_t, f_t>& cuts
   // Adjust row_permutation_ and inverse_row_permutation_
   row_permutation_.resize(m + cuts_basic.m);
   inverse_row_permutation_.resize(m + cuts_basic.m);
+  work_estimate_ += 2 * (m + cuts_basic.m);
   for (i_t k = m; k < m + cuts_basic.m; ++k) {
     row_permutation_[k] = k;
   }
+  work_estimate_ += cuts_basic.m;
   inverse_permutation(row_permutation_, inverse_row_permutation_);
 
   // Adjust workspace sizes
   xi_workspace_.resize(2 * (m + cuts_basic.m), 0);
   x_workspace_.resize(m + cuts_basic.m, 0.0);
+  work_estimate_ += 3 * (m + cuts_basic.m);
 
   return 0;
 }
@@ -1331,6 +1360,7 @@ void basis_update_mpf_t<i_t, f_t>::gather_into_sparse_vector(i_t nz,
   out.x.clear();
   out.i.reserve(nz);
   out.x.reserve(nz);
+  work_estimate_ += 2 * nz;
   const f_t zero_tol = 1e-13;
   for (i_t k = 0; k < nz; ++k) {
     const i_t i = xi_workspace_[m + k];
@@ -1342,6 +1372,8 @@ void basis_update_mpf_t<i_t, f_t>::gather_into_sparse_vector(i_t nz,
     xi_workspace_[i]     = 0;
     x_workspace_[i]      = 0.0;
   }
+  work_estimate_ += 5 * nz;
+  work_estimate_ += 3 * out.i.size();
 }
 
 template <typename i_t, typename f_t>
@@ -1355,10 +1387,12 @@ void basis_update_mpf_t<i_t, f_t>::solve_to_workspace(i_t top) const
     xi_workspace_[p]      = 0;
     nz++;
   }
+  work_estimate_ += 3 * (m - top);
   for (i_t k = 0; k < nz; ++k) {
     const i_t i      = xi_workspace_[m + k];
     xi_workspace_[i] = 1;
   }
+  work_estimate_ += 2 * nz;
 }
 
 template <typename i_t, typename f_t>
@@ -1372,6 +1406,7 @@ void basis_update_mpf_t<i_t, f_t>::solve_to_sparse_vector(i_t top,
   out.i.clear();
   out.x.reserve(nz);
   out.i.reserve(nz);
+  work_estimate_ += 2 * nz;
   i_t k              = 0;
   const f_t zero_tol = 1e-13;
   for (i_t p = top; p < m; ++p) {
@@ -1384,6 +1419,7 @@ void basis_update_mpf_t<i_t, f_t>::solve_to_sparse_vector(i_t top,
     xi_workspace_[p] = 0;
     k++;
   }
+  work_estimate_ += 4 * k + 3 * out.i.size();
 }
 
 template <typename i_t, typename f_t>
@@ -1397,10 +1433,12 @@ i_t basis_update_mpf_t<i_t, f_t>::scatter_into_workspace(const sparse_vector_t<i
     xi_workspace_[i]     = 1;
     xi_workspace_[m + k] = i;
   }
+  work_estimate_ += 3 * nz;
   // scatter values into x_workspace_
   for (i_t k = 0; k < nz; ++k) {
     x_workspace_[in.i[k]] = in.x[k];
   }
+  work_estimate_ += 3 * nz;
   return nz;
 }
 
@@ -1412,11 +1450,13 @@ void basis_update_mpf_t<i_t, f_t>::grow_storage(i_t nz, i_t& S_start, i_t& S_nz)
   const i_t new_last_S_col = last_S_col + 2;
   if (new_last_S_col >= S_.col_start.size()) {
     S_.col_start.resize(new_last_S_col + refactor_frequency_);
+    work_estimate_ += new_last_S_col + refactor_frequency_;
   }
   S_nz = S_.col_start[last_S_col];
   if (S_nz + nz > S_.i.size()) {
     S_.i.resize(std::max(2 * S_nz, S_nz + nz));
     S_.x.resize(std::max(2 * S_nz, S_nz + nz));
+    work_estimate_ += 2 * std::max(2 * S_nz, S_nz + nz);
   }
   S_start = last_S_col;
   assert(S_nz + nz <= S_.i.size());
@@ -1431,6 +1471,7 @@ i_t basis_update_mpf_t<i_t, f_t>::nonzeros(const std::vector<f_t>& x) const
   for (i_t i = 0; i < xsz; ++i) {
     if (x[i] != 0.0) { nz++; }
   }
+  work_estimate_ += xsz;
   return nz;
 }
 
@@ -1445,6 +1486,7 @@ f_t basis_update_mpf_t<i_t, f_t>::dot_product(i_t col, const std::vector<f_t>& x
     const i_t i = S_.i[p];
     dot += S_.x[p] * x[i];
   }
+  work_estimate_ += 3 * (col_end - col_start);
   return dot;
 }
 
@@ -1457,10 +1499,15 @@ f_t basis_update_mpf_t<i_t, f_t>::dot_product(i_t col,
   f_t dot             = 0.0;
   const i_t col_start = S_.col_start[col];
   const i_t col_end   = S_.col_start[col + 1];
+  i_t nz_mark         = 0;
   for (i_t p = col_start; p < col_end; ++p) {
     const i_t i = S_.i[p];
-    if (mark[i]) { dot += S_.x[p] * x[i]; }
+    if (mark[i]) {
+      dot += S_.x[p] * x[i];
+      nz_mark++;
+    }
   }
+  work_estimate_ += 2 * nz_mark + (col_end - col_start);
   return dot;
 }
 
@@ -1477,6 +1524,7 @@ void basis_update_mpf_t<i_t, f_t>::add_sparse_column(const csc_matrix_t<i_t, f_t
     const i_t i = S.i[p];
     x[i] += theta * S.x[p];
   }
+  work_estimate_ += 3 * (col_end - col_start);
 }
 
 template <typename i_t, typename f_t>
@@ -1490,6 +1538,7 @@ void basis_update_mpf_t<i_t, f_t>::add_sparse_column(const csc_matrix_t<i_t, f_t
   const i_t m         = L0_.m;
   const i_t col_start = S.col_start[col];
   const i_t col_end   = S.col_start[col + 1];
+  i_t nz_start        = nz;
   for (i_t p = col_start; p < col_end; ++p) {
     const i_t i = S.i[p];
     if (!mark[i]) {
@@ -1500,6 +1549,7 @@ void basis_update_mpf_t<i_t, f_t>::add_sparse_column(const csc_matrix_t<i_t, f_t
     }
     x[i] += theta * S.x[p];
   }
+  work_estimate_ += 4 * (col_end - col_start) + 2 * (nz - nz_start);
 }
 
 template <typename i_t, typename f_t>
@@ -1528,14 +1578,17 @@ i_t basis_update_mpf_t<i_t, f_t>::b_transpose_solve(const std::vector<f_t>& rhs,
 
   // Solve for r such that U'*r = c
   std::vector<f_t> r = rhs;
+  work_estimate_ += 2 * r.size();
   u_transpose_solve(r);
   UTsol = r;
+  work_estimate_ += 2 * r.size();
 
   // Solve for w such that L'*w = r
   l_transpose_solve(r);
 
   // Compute y = P'*w
   inverse_permute_vector(row_permutation_, r, solution);
+  work_estimate_ += 3 * r.size();
 
   return 0;
 }
@@ -1630,7 +1683,7 @@ template <typename i_t, typename f_t>
 i_t basis_update_mpf_t<i_t, f_t>::u_transpose_solve(std::vector<f_t>& rhs) const
 {
   total_dense_U_transpose_++;
-  dual_simplex::upper_triangular_transpose_solve(U0_, rhs);
+  dual_simplex::upper_triangular_transpose_solve(U0_, rhs, work_estimate_);
   return 0;
 }
 
@@ -1641,7 +1694,7 @@ i_t basis_update_mpf_t<i_t, f_t>::u_transpose_solve(sparse_vector_t<i_t, f_t>& r
   // U0'*x = y
   // Solve U0'*x0 = y
   i_t top = dual_simplex::sparse_triangle_solve<i_t, f_t, true>(
-    rhs, std::nullopt, xi_workspace_, U0_transpose_, x_workspace_.data());
+    rhs, std::nullopt, xi_workspace_, U0_transpose_, x_workspace_.data(), work_estimate_);
   solve_to_sparse_vector(top, rhs);
   return 0;
 }
@@ -1671,9 +1724,10 @@ i_t basis_update_mpf_t<i_t, f_t>::l_transpose_solve(std::vector<f_t>& rhs) const
 
     if (std::abs(theta) > zero_tol) { add_sparse_column(S_, v_col, -theta, rhs); }
   }
+  work_estimate_ += 2 * num_updates_;
 
   // Solve for x such that L0^T * x = b'
-  dual_simplex::lower_triangular_transpose_solve(L0_, rhs);
+  dual_simplex::lower_triangular_transpose_solve(L0_, rhs, work_estimate_);
 
   return 0;
 }
@@ -1729,6 +1783,7 @@ i_t basis_update_mpf_t<i_t, f_t>::l_transpose_solve(sparse_vector_t<i_t, f_t>& r
     }
 #endif
   }
+  work_estimate_ += 2 * num_updates_;
 
 #ifdef CHECK_MULTIPLY
   for (i_t i = 0; i < m; ++i) {
@@ -1743,9 +1798,10 @@ i_t basis_update_mpf_t<i_t, f_t>::l_transpose_solve(sparse_vector_t<i_t, f_t>& r
 #endif
 
   sparse_vector_t<i_t, f_t> b(m, nz);
+  work_estimate_ += nz;
   gather_into_sparse_vector(nz, b);
   i_t top = dual_simplex::sparse_triangle_solve<i_t, f_t, false>(
-    b, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data());
+    b, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data(), work_estimate_);
   solve_to_sparse_vector(top, rhs);
 
 #ifdef CHECK_SPARSE_SOLVE
@@ -1772,6 +1828,7 @@ i_t basis_update_mpf_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs,
 {
   const i_t m = L0_.m;
   std::vector<f_t> Lsol(m);
+  work_estimate_ += m;
   return b_solve(rhs, solution, Lsol);
 }
 
@@ -1788,6 +1845,7 @@ i_t basis_update_mpf_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs,
   // P*B*x = P*b
 
   permute_vector(row_permutation_, rhs, solution);
+  work_estimate_ += 3 * rhs.size();
 
   // L*U*x = b'
   // Solve for v such that L*v = b'
@@ -1795,7 +1853,10 @@ i_t basis_update_mpf_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs,
   std::vector<f_t> rhs_permuted = solution;
 #endif
   l_solve(solution);
-  if (need_Lsol) { Lsol = solution; }
+  if (need_Lsol) {
+    Lsol = solution;
+    work_estimate_ += 2 * solution.size();
+  }
 
 #ifdef CHECK_L_SOLVE
   std::vector<f_t> Lsol_check = Lsol;
@@ -1836,7 +1897,9 @@ i_t basis_update_mpf_t<i_t, f_t>::b_solve(const sparse_vector_t<i_t, f_t>& rhs,
 {
   const i_t m = L0_.m;
   solution    = rhs;
+  work_estimate_ += 2 * rhs.i.size();
   solution.inverse_permute_vector(inverse_row_permutation_);
+  work_estimate_ += 3 * rhs.i.size();
 
 #ifdef CHECK_PERMUTATION
   std::vector<f_t> permuation_rhs;
@@ -1866,10 +1929,15 @@ i_t basis_update_mpf_t<i_t, f_t>::b_solve(const sparse_vector_t<i_t, f_t>& rhs,
   } else {
     std::vector<f_t> solution_dense;
     solution.to_dense(solution_dense);
+    work_estimate_ += solution_dense.size();
     l_solve(solution_dense);
     solution.from_dense(solution_dense);
+    work_estimate_ += solution_dense.size();
+  }
+  if (need_Lsol) {
+    Lsol = solution;
+    work_estimate_ += 2 * solution.i.size();
   }
-  if (need_Lsol) { Lsol = solution; }
   sum_L_ += static_cast<f_t>(solution.i.size()) / input_size;
 
 #ifdef CHECK_L_SOLVE
@@ -1897,8 +1965,10 @@ i_t basis_update_mpf_t<i_t, f_t>::b_solve(const sparse_vector_t<i_t, f_t>& rhs,
   } else {
     std::vector<f_t> solution_dense;
     solution.to_dense(solution_dense);
+    work_estimate_ += solution_dense.size();
     u_solve(solution_dense);
     solution.from_dense(solution_dense);
+    work_estimate_ += solution_dense.size();
   }
   sum_U_ += static_cast<f_t>(solution.i.size()) / rhs_size;
 
@@ -1921,7 +1991,7 @@ i_t basis_update_mpf_t<i_t, f_t>::u_solve(std::vector<f_t>& rhs) const
   total_dense_U_++;
   const i_t m = L0_.m;
   // U*x = y
-  dual_simplex::upper_triangular_solve(U0_, rhs);
+  dual_simplex::upper_triangular_solve(U0_, rhs, work_estimate_);
   return 0;
 }
 
@@ -1934,7 +2004,7 @@ i_t basis_update_mpf_t<i_t, f_t>::u_solve(sparse_vector_t<i_t, f_t>& rhs) const
 
   // Solve U0*x = y
   i_t top = dual_simplex::sparse_triangle_solve<i_t, f_t, false>(
-    rhs, std::nullopt, xi_workspace_, U0_, x_workspace_.data());
+    rhs, std::nullopt, xi_workspace_, U0_, x_workspace_.data(), work_estimate_);
   solve_to_sparse_vector(top, rhs);
 
   return 0;
@@ -1955,7 +2025,7 @@ i_t basis_update_mpf_t<i_t, f_t>::l_solve(std::vector<f_t>& rhs) const
 #ifdef CHECK_L_SOLVE
   std::vector<f_t> rhs_check = rhs;
 #endif
-  dual_simplex::lower_triangular_solve(L0_, rhs);
+  dual_simplex::lower_triangular_solve(L0_, rhs, work_estimate_);
 
 #ifdef CHECK_L0_SOLVE
   matrix_vector_multiply(L0_, 1.0, rhs, -1.0, residual);
@@ -1979,6 +2049,7 @@ i_t basis_update_mpf_t<i_t, f_t>::l_solve(std::vector<f_t>& rhs) const
 
     if (std::abs(theta) > zero_tol) { add_sparse_column(S_, u_col, -theta, rhs); }
   }
+  work_estimate_ += 2 * num_updates_;
 
 #ifdef CHECK_L_SOLVE
   std::vector<f_t> inout = rhs;
@@ -2004,7 +2075,7 @@ i_t basis_update_mpf_t<i_t, f_t>::l_solve(sparse_vector_t<i_t, f_t>& rhs) const
 
   // First solve L0*x0 = y
   i_t top = dual_simplex::sparse_triangle_solve<i_t, f_t, true>(
-    rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data());
+    rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data(), work_estimate_);
   solve_to_workspace(top);  // Uses xi_workspace_ and x_workspace_ to fill rhs
   i_t nz = m - top;
   // Then T0 * T1 * ... * T_{num_updates_ - 1} * x = x0
@@ -2027,6 +2098,7 @@ i_t basis_update_mpf_t<i_t, f_t>::l_solve(sparse_vector_t<i_t, f_t>& rhs) const
       add_sparse_column(S_, u_col, -theta, xi_workspace_, nz, x_workspace_);
     }
   }
+  work_estimate_ += 2 * num_updates_;
 
   gather_into_sparse_vector(nz, rhs);
 
@@ -2049,6 +2121,7 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const std::vector<f_t>& utilde,
   const i_t col_start = U0_.col_start[leaving_index];
   const i_t col_end   = U0_.col_start[leaving_index + 1];
   std::vector<f_t> u  = utilde;
+  work_estimate_ += 2 * utilde.size();
   // u = utilde - U0(:, leaving_index)
   add_sparse_column(U0_, leaving_index, -1.0, u);
 
@@ -2069,9 +2142,11 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const std::vector<f_t>& utilde,
 
   // Scatter u into S
   S_.append_column(u);
+  work_estimate_ += u.size() + 3 * u_nz;
 
   // Scatter v into S
   S_.append_column(etilde);
+  work_estimate_ += etilde.size() + 3 * v_nz;
 
   // Compute mu = 1 + v^T * u
   const f_t mu = 1.0 + sparse_dot(S_.i.data() + S_.col_start[S_start],
@@ -2080,6 +2155,7 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const std::vector<f_t>& utilde,
                                   S_.i.data() + S_.col_start[S_start + 1],
                                   S_.x.data() + S_.col_start[S_start + 1],
                                   v_nz);
+  work_estimate_ += 3 * std::min(u_nz, v_nz);
 
   if (std::abs(mu) < 1E-8 || std::abs(mu) > 1E+8) {
     // Force a refactor. Otherwise we will get numerical issues when dividing by mu.
@@ -2126,6 +2202,7 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const sparse_vector_t<i_t, f_t>& utilde
 
   // Ensure the workspace is sorted. Otherwise, the sparse dot will be incorrect.
   std::sort(xi_workspace_.begin() + m, xi_workspace_.begin() + m + nz, std::less<i_t>());
+  work_estimate_ += (m + nz) * std::log2(m + nz);
 
   // Gather the workspace into a column of S
   i_t S_start;
@@ -2133,10 +2210,13 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const sparse_vector_t<i_t, f_t>& utilde
   grow_storage(nz + etilde.i.size(), S_start, S_nz);
 
   S_.append_column(nz, xi_workspace_.data() + m, x_workspace_.data());
+  work_estimate_ += 5 * nz;
 
   // Gather etilde into a column of S
   etilde.sort();  // Needs to be sorted for the sparse dot. TODO(CMM): Is etilde sorted on input?
+  work_estimate_ += etilde.i.size() * std::log2(etilde.i.size());
   S_.append_column(etilde);
+  work_estimate_ += 4 * etilde.i.size();
 
   // Compute mu = 1 + v^T * u
   const f_t mu = 1.0 + sparse_dot(S_.i.data() + S_.col_start[S_start],
@@ -2145,6 +2225,7 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const sparse_vector_t<i_t, f_t>& utilde
                                   S_.i.data() + S_.col_start[S_start + 1],
                                   S_.x.data() + S_.col_start[S_start + 1],
                                   S_.col_start[S_start + 2] - S_.col_start[S_start + 1]);
+  work_estimate_ += 3 * std::min(nz, static_cast<i_t>(etilde.i.size()));
 
   if (std::abs(mu) < 1E-8 || std::abs(mu) > 1E+8) {
     // Force a refactor. Otherwise we will get numerical issues when dividing by mu.
@@ -2158,6 +2239,7 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const sparse_vector_t<i_t, f_t>& utilde
     x_workspace_[i]      = 0.0;
     xi_workspace_[m + k] = 0;
   }
+  work_estimate_ += 4 * nz;
 
 #ifdef PRINT_MU_INFO
   printf("Update mu %e u nz %d v nz %d\n",
@@ -2190,7 +2272,6 @@ void basis_update_mpf_t<i_t, f_t>::l_multiply(std::vector<f_t>& inout) const
     const f_t theta = dot;
     add_sparse_column(S_, u_col, theta, inout);
   }
-
   std::vector<f_t> out(m, 0.0);
   matrix_vector_multiply(L0_, 1.0, inout, 0.0, out);
   inout = out;
@@ -2271,7 +2352,10 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
   std::vector<i_t> slacks_needed;
   std::vector<i_t> superbasic_list;  // Empty superbasic list
 
-  if (L0_.m != A.m) { resize(A.m); }
+  if (L0_.m != A.m) {
+    resize(A.m);
+    work_estimate_ += A.m;
+  }
   std::vector<i_t> q;
   i_t status = factorize_basis(A,
                                settings,
@@ -2282,7 +2366,8 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
                                inverse_row_permutation_,
                                q,
                                deficient,
-                               slacks_needed);
+                               slacks_needed,
+                               work_estimate_);
   if (status == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; }
   if (status == -1) {
     settings.log.debug("Initial factorization failed\n");
@@ -2295,7 +2380,8 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
                  basic_list,
                  nonbasic_list,
                  superbasic_list,
-                 vstatus);
+                 vstatus,
+                 work_estimate_);
 
 #ifdef CHECK_BASIS_REPAIR
     const i_t m = A.m;
@@ -2323,7 +2409,8 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
                              inverse_row_permutation_,
                              q,
                              deficient,
-                             slacks_needed);
+                             slacks_needed,
+                             work_estimate_);
     if (status == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; }
     if (status == -1) {
 #ifdef CHECK_L_FACTOR
@@ -2338,6 +2425,7 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
 
   assert(q.size() == A.m);
   reorder_basic_list(q, basic_list);  // We no longer need q after reordering the basic list
+  work_estimate_ += 3 * q.size();
   reset();
   return 0;
 }
diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp
index d9783053f7..bb2b42d8ae 100644
--- a/cpp/src/dual_simplex/basis_updates.hpp
+++ b/cpp/src/dual_simplex/basis_updates.hpp
@@ -265,6 +265,7 @@ class basis_update_mpf_t {
     assert(p.size() == Linit.m);
     row_permutation_ = p;
     inverse_permutation(row_permutation_, inverse_row_permutation_);
+    work_estimate_ += 4 * p.size();
     clear();
     compute_transposes();
     reset_stats();
@@ -371,6 +372,7 @@ class basis_update_mpf_t {
   {
     L0_.transpose(L0_transpose_);
     U0_.transpose(U0_transpose_);
+    work_estimate_ += 6 * L0_.col_start[L0_.n] + 6 * U0_.col_start[U0_.n];
   }
 
   void multiply_lu(csc_matrix_t<i_t, f_t>& out) const;
@@ -386,6 +388,9 @@ class basis_update_mpf_t {
 
   void set_refactor_frequency(i_t new_frequency) { refactor_frequency_ = new_frequency; }
 
+  f_t work_estimate() const { return work_estimate_; }
+  void clear_work_estimate() { work_estimate_ = 0.0; }
+
  private:
   void clear()
   {
@@ -400,9 +405,11 @@ class basis_update_mpf_t {
     mu_values_.clear();
     mu_values_.reserve(refactor_frequency_);
     num_updates_ = 0;
+    work_estimate_ += 2 * refactor_frequency_;
 
     std::fill(xi_workspace_.begin(), xi_workspace_.end(), 0);
     std::fill(x_workspace_.begin(), x_workspace_.end(), 0.0);
+    work_estimate_ += xi_workspace_.size() + x_workspace_.size();
   }
 
   void grow_storage(i_t nz, i_t& S_start, i_t& S_nz);
@@ -472,6 +479,8 @@ class basis_update_mpf_t {
   mutable f_t sum_U_transpose_;
 
   f_t hypersparse_threshold_;
+
+  mutable f_t work_estimate_{0.0};
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
index fac65b8140..e30b067398 100644
--- a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
+++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
@@ -26,9 +26,9 @@ i_t bound_flipping_ratio_test_t<i_t, f_t>::compute_breakpoints(std::vector<i_t>&
 
   i_t idx = 0;
   while (idx == 0 && pivot_tol >= 1e-12) {
-    // for (i_t k = 0; k < n - m; ++k) {
-    //   const i_t j = nonbasic_list_[k];
-    for (i_t h = 0; h < delta_z_indices_.size(); ++h) {
+    // Loop over the nonbasic variables j with non-zero delta_z
+    const i_t nz = delta_z_indices_.size();
+    for (i_t h = 0; h < nz; ++h) {
       const i_t j = delta_z_indices_[h];
       const i_t k = nonbasic_mark_[j];
       if (vstatus_[j] == variable_status_t::NONBASIC_FIXED) { continue; }
@@ -45,6 +45,8 @@ i_t bound_flipping_ratio_test_t<i_t, f_t>::compute_breakpoints(std::vector<i_t>&
         idx++;
       }
     }
+    work_estimate_ += 4 * nz;
+    work_estimate_ += 4 * idx;
     pivot_tol /= 10;
   }
   return idx;
@@ -66,11 +68,15 @@ i_t bound_flipping_ratio_test_t<i_t, f_t>::single_pass(i_t start,
   i_t candidate  = -1;
   f_t zero_tol   = settings_.zero_tol;
   i_t k_idx      = -1;
+
+  i_t min_found    = 0;
+  i_t harris_found = 0;
   for (i_t k = start; k < end; ++k) {
     if (ratios[k] < min_val) {
       min_val   = ratios[k];
       candidate = indicies[k];
       k_idx     = k;
+      min_found++;
     } else if (ratios[k] < min_val + zero_tol) {
       // Use Harris to select variables with larger pivots
       const i_t j = nonbasic_list_[indicies[k]];
@@ -79,8 +85,11 @@ i_t bound_flipping_ratio_test_t<i_t, f_t>::single_pass(i_t start,
         candidate = indicies[k];
         k_idx     = k;
       }
+      harris_found++;
     }
   }
+  work_estimate_ += (end - start) + 2 * min_found + 6 * harris_found;
+
   step_length       = min_val;
   nonbasic_entering = candidate;
   // this should be temporary, find root causes where the candidate is not filled
@@ -116,6 +125,7 @@ i_t bound_flipping_ratio_test_t<i_t, f_t>::compute_step_length(f_t& step_length,
   // Compute the initial set of breakpoints
   std::vector<i_t> indicies(nz);
   std::vector<f_t> ratios(nz);
+  work_estimate_ += 2 * nz;
   i_t num_breakpoints = compute_breakpoints(indicies, ratios);
   if constexpr (verbose) { settings_.log.printf("Initial breakpoints %d\n", num_breakpoints); }
   if (num_breakpoints == 0) {
@@ -132,7 +142,7 @@ i_t bound_flipping_ratio_test_t<i_t, f_t>::compute_step_length(f_t& step_length,
   if (k_idx == RATIO_TEST_NUMERICAL_ISSUES) { return RATIO_TEST_NUMERICAL_ISSUES; }
   bool continue_search = k_idx >= 0 && num_breakpoints > 1 && slope > 0.0;
   if (!continue_search) {
-    if constexpr (0) {
+    if constexpr (verbose) {
       settings_.log.printf(
         "BFRT stopping. No bound flips. Step length %e Nonbasic entering %d Entering %d pivot %e\n",
         step_length,
@@ -163,6 +173,7 @@ i_t bound_flipping_ratio_test_t<i_t, f_t>::compute_step_length(f_t& step_length,
     for (i_t k = 0; k < num_breakpoints - 1; ++k) {
       if (ratios[k] > max_ratio) { max_ratio = ratios[k]; }
     }
+    work_estimate_ += 2 * num_breakpoints;
     settings_.log.printf(
       "Starting heap passes. %d breakpoints max ratio %e\n", num_breakpoints - 1, max_ratio);
     bucket_pass(
@@ -207,6 +218,7 @@ void bound_flipping_ratio_test_t<i_t, f_t>::heap_passes(const std::vector<i_t>&
                            std::abs(delta_z[nonbasic_list[current_indicies[k]]]));
     }
   }
+  work_estimate_ += N;
 
   auto compare = [zero_tol, &current_ratios, &current_indicies, &delta_z, &nonbasic_list](
                    const i_t& a, const i_t& b) {
@@ -217,11 +229,13 @@ void bound_flipping_ratio_test_t<i_t, f_t>::heap_passes(const std::vector<i_t>&
   };
 
   std::make_heap(bare_idx.begin(), bare_idx.end(), compare);
+  work_estimate_ += 3 * bare_idx.size();
 
   while (bare_idx.size() > 0 && slope > 0) {
     // Remove minimum ratio from the heap and rebalance
     i_t heap_index = bare_idx.front();
     std::pop_heap(bare_idx.begin(), bare_idx.end(), compare);
+    work_estimate_ += 2 * std::log2(bare_idx.size());
     bare_idx.pop_back();
 
     nonbasic_entering = current_indicies[heap_index];
diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp
index 51b00b1097..244ff334df 100644
--- a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp
+++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp
@@ -54,6 +54,7 @@ class bound_flipping_ratio_test_t {
   }
 
   i_t compute_step_length(f_t& step_length, i_t& nonbasic_entering);
+  f_t work_estimate() const { return work_estimate_; }
 
  private:
   i_t compute_breakpoints(std::vector<i_t>& indices, std::vector<f_t>& ratios);
@@ -98,6 +99,8 @@ class bound_flipping_ratio_test_t {
 
   i_t n_;
   i_t m_;
+
+  f_t work_estimate_;
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/bounds_strengthening.cpp b/cpp/src/dual_simplex/bounds_strengthening.cpp
index 37ab114a7e..6506ce4873 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.cpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.cpp
@@ -102,11 +102,6 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
   std::vector<bool> variable_changed(n, false);
   std::vector<bool> constraint_changed_next(m, false);
 
-  auto& A_i    = A.i.underlying();
-  auto& A_x    = A.x.underlying();
-  auto& Arow_j = Arow.j.underlying();
-  auto& Arow_x = Arow.x.underlying();
-
   size_t nnz_processed = 0;
 
   if (!bounds_changed.empty()) {
@@ -116,7 +111,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
         const i_t col_start = A.col_start[j];
         const i_t col_end   = A.col_start[j + 1];
         for (i_t p = col_start; p < col_end; ++p) {
-          const i_t i           = A_i[p];
+          const i_t i           = A.i[p];
           constraint_changed[i] = true;
         }
       }
@@ -139,8 +134,8 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       f_t min_a = 0.0;
       f_t max_a = 0.0;
       for (i_t p = row_start; p < row_end; ++p) {
-        const i_t j    = Arow_j[p];
-        const f_t a_ij = Arow_x[p];
+        const i_t j    = Arow.j[p];
+        const f_t a_ij = Arow.x[p];
 
         variable_changed[j] = true;
         if (a_ij > 0) {
@@ -192,10 +187,10 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       const i_t col_end   = A.col_start[k + 1];
       nnz_processed += (col_end - col_start);
       for (i_t p = col_start; p < col_end; ++p) {
-        const i_t i = A_i[p];
+        const i_t i = A.i[p];
 
         if (!constraint_changed[i]) { continue; }
-        const f_t a_ik = A_x[p];
+        const f_t a_ik = A.x[p];
 
         f_t delta_min_act = delta_min_activity[i];
         f_t delta_max_act = delta_max_activity[i];
diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp
index 597628e735..aa2baa6ea2 100644
--- a/cpp/src/dual_simplex/crossover.cpp
+++ b/cpp/src/dual_simplex/crossover.cpp
@@ -505,8 +505,9 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
         std::vector<i_t> q(m);
         std::vector<i_t> deficient;
         std::vector<i_t> slacks_needed;
-        i_t rank =
-          factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+        f_t work_estimate = 0;
+        i_t rank          = factorize_basis(
+          lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
         if (rank == CONCURRENT_HALT_RETURN) {
           return CONCURRENT_HALT_RETURN;
         } else if (rank != m) {
@@ -520,9 +521,10 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
                        basic_list,
                        nonbasic_list,
                        superbasic_list,
-                       vstatus);
-          rank =
-            factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+                       vstatus,
+                       work_estimate);
+          rank = factorize_basis(
+            lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
           if (rank == CONCURRENT_HALT_RETURN) {
             return CONCURRENT_HALT_RETURN;
           } else if (rank == -1) {
@@ -860,8 +862,9 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
         std::vector<i_t> q(m);
         std::vector<i_t> deficient;
         std::vector<i_t> slacks_needed;
-        i_t rank =
-          factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+        f_t work_estimate = 0;
+        i_t rank          = factorize_basis(
+          lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
         if (rank == CONCURRENT_HALT_RETURN) {
           return CONCURRENT_HALT_RETURN;
         } else if (rank != m) {
@@ -875,12 +878,13 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
                        basic_list,
                        nonbasic_list,
                        superbasic_list,
-                       vstatus);
+                       vstatus,
+                       work_estimate);
           // We need to be careful. As basis_repair may have changed the superbasic list
           find_primal_superbasic_variables(
             lp, settings, solution, solution, vstatus, nonbasic_list, superbasic_list);
-          rank =
-            factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+          rank = factorize_basis(
+            lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
           if (rank == CONCURRENT_HALT_RETURN) {
             return CONCURRENT_HALT_RETURN;
           } else if (rank == -1) {
@@ -1142,6 +1146,7 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   const i_t m         = lp.num_rows;
   const i_t n         = lp.num_cols;
   f_t crossover_start = tic();
+  f_t work_estimate   = 0;
 
   settings.log.printf("\n");
   settings.log.printf("Starting crossover\n");
@@ -1223,7 +1228,8 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   std::vector<i_t> deficient;
   std::vector<i_t> slacks_needed;
 
-  rank = factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+  rank = factorize_basis(
+    lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
   if (rank == CONCURRENT_HALT_RETURN) { return crossover_status_t::CONCURRENT_LIMIT; }
   if (rank != m) {
     settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
@@ -1236,8 +1242,10 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
                  basic_list,
                  nonbasic_list,
                  superbasic_list,
-                 vstatus);
-    rank = factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+                 vstatus,
+                 work_estimate);
+    rank = factorize_basis(
+      lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
     if (rank == CONCURRENT_HALT_RETURN) {
       return crossover_status_t::CONCURRENT_LIMIT;
     } else if (rank == -1) {
@@ -1392,8 +1400,8 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
       nonbasic_list.clear();
       superbasic_list.clear();
       get_basis_from_vstatus(m, vstatus, basic_list, nonbasic_list, superbasic_list);
-      rank =
-        factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+      rank = factorize_basis(
+        lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
       if (rank == CONCURRENT_HALT_RETURN) {
         return crossover_status_t::CONCURRENT_LIMIT;
       } else if (rank != m) {
@@ -1407,9 +1415,10 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
                      basic_list,
                      nonbasic_list,
                      superbasic_list,
-                     vstatus);
-        rank =
-          factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+                     vstatus,
+                     work_estimate);
+        rank = factorize_basis(
+          lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
         if (rank == CONCURRENT_HALT_RETURN) {
           return crossover_status_t::CONCURRENT_LIMIT;
         } else if (rank == -1) {
diff --git a/cpp/src/dual_simplex/folding.cpp b/cpp/src/dual_simplex/folding.cpp
index 5e42afb01f..f851e51d79 100644
--- a/cpp/src/dual_simplex/folding.cpp
+++ b/cpp/src/dual_simplex/folding.cpp
@@ -128,8 +128,8 @@ void compute_sums(const csc_matrix_t<i_t, f_t>& A,
     // Find all vertices (columns) that have a neighbor in the refining color
     colors_to_update.reserve(num_col_colors);
     find_vertices_to_refine(refining_color.vertices,
-                            Arow.row_start.underlying(),
-                            Arow.j.underlying(),
+                            Arow.row_start,
+                            Arow.j,
                             col_color_map,
                             marked_vertices,
                             vertices_to_refine,
@@ -145,9 +145,9 @@ void compute_sums(const csc_matrix_t<i_t, f_t>& A,
     compute_sums_of_refined_vertices(refining_color.color,
                                      refining_color.vertices,
                                      vertices_to_refine,
-                                     Arow.row_start.underlying(),
-                                     Arow.j.underlying(),
-                                     Arow.x.underlying(),
+                                     Arow.row_start,
+                                     Arow.j,
+                                     Arow.x,
                                      col_color_map,
                                      vertex_to_sum,
                                      max_sum_by_color);
@@ -156,8 +156,8 @@ void compute_sums(const csc_matrix_t<i_t, f_t>& A,
     // Find all vertices (rows) that have a neighbor in the refining color
     colors_to_update.reserve(num_row_colors);
     find_vertices_to_refine(refining_color.vertices,
-                            A.col_start.underlying(),
-                            A.i.underlying(),
+                            A.col_start,
+                            A.i,
                             row_color_map,
                             marked_vertices,
                             vertices_to_refine,
@@ -173,9 +173,9 @@ void compute_sums(const csc_matrix_t<i_t, f_t>& A,
     compute_sums_of_refined_vertices(refining_color.color,
                                      refining_color.vertices,
                                      vertices_to_refine,
-                                     A.col_start.underlying(),
-                                     A.i.underlying(),
-                                     A.x.underlying(),
+                                     A.col_start,
+                                     A.i,
+                                     A.x,
                                      row_color_map,
                                      vertex_to_sum,
                                      max_sum_by_color);
diff --git a/cpp/src/dual_simplex/initial_basis.cpp b/cpp/src/dual_simplex/initial_basis.cpp
index 5da8449040..9343769d00 100644
--- a/cpp/src/dual_simplex/initial_basis.cpp
+++ b/cpp/src/dual_simplex/initial_basis.cpp
@@ -79,7 +79,8 @@ i_t initial_basis_selection(const lp_problem_t<i_t, f_t>& problem,
     i_t row_singletons;
     i_t col_singletons;
     std::vector<i_t> row_perm(N);
-    find_singletons(C, row_singletons, row_perm, col_singletons, q);
+    f_t work_estimate = 0;
+    find_singletons(C, row_singletons, row_perm, col_singletons, q, work_estimate);
     std::vector<i_t> row_perm_inv(N);
     inverse_permutation(row_perm, row_perm_inv);
 
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 1e057a16b8..50af6971b9 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -326,55 +326,47 @@ template <typename i_t, typename f_t>
 void compute_reduced_cost_update(const lp_problem_t<i_t, f_t>& lp,
                                  const std::vector<i_t>& basic_list,
                                  const std::vector<i_t>& nonbasic_list,
-                                 const ins_vector<f_t>& delta_y,
+                                 const std::vector<f_t>& delta_y,
                                  i_t leaving_index,
                                  i_t direction,
-                                 ins_vector<i_t>& delta_z_mark,
-                                 ins_vector<i_t>& delta_z_indices,
-                                 ins_vector<f_t>& delta_z)
+                                 std::vector<i_t>& delta_z_mark,
+                                 std::vector<i_t>& delta_z_indices,
+                                 std::vector<f_t>& delta_z,
+                                 f_t& work_estimate)
 {
   const i_t m = lp.num_rows;
   const i_t n = lp.num_cols;
 
-  const f_t* __restrict__ delta_y_ptr   = delta_y.data();
-  const f_t* __restrict__ Ax            = lp.A.x.data();
-  const i_t* __restrict__ Ai            = lp.A.i.data();
-  const i_t* __restrict__ ptr_col_start = lp.A.col_start.data();
-  f_t* __restrict__ delta_z_ptr         = delta_z.data();
-
   size_t nnzs_processed = 0;
-
   // delta_zB = sigma*ei
   for (i_t k = 0; k < m; k++) {
-    const i_t j    = basic_list[k];
-    delta_z_ptr[j] = 0;
+    const i_t j = basic_list[k];
+    delta_z[j]  = 0;
   }
-  delta_z_ptr[leaving_index] = direction;
+  work_estimate += 2 * m;
+  delta_z[leaving_index] = direction;
   // delta_zN = -N'*delta_y
   const i_t num_nonbasic = n - m;
   for (i_t k = 0; k < num_nonbasic; k++) {
     const i_t j = nonbasic_list[k];
     // z_j <- -A(:, j)'*delta_y
-    const i_t col_start = ptr_col_start[j];
-    const i_t col_end   = ptr_col_start[j + 1];
+    const i_t col_start = lp.A.col_start[j];
+    const i_t col_end   = lp.A.col_start[j + 1];
     f_t dot             = 0.0;
     for (i_t p = col_start; p < col_end; ++p) {
-      dot += Ax[p] * delta_y_ptr[Ai[p]];
+      dot += lp.A.x[p] * delta_y[lp.A.i[p]];
     }
     nnzs_processed += col_end - col_start;
 
-    delta_z_ptr[j] = -dot;
+    delta_z[j] = -dot;
     if (dot != 0.0) {
       delta_z_indices.push_back(j);  // Note delta_z_indices has n elements reserved
       delta_z_mark[j] = 1;
     }
   }
-
-  lp.A.x.byte_loads += nnzs_processed * sizeof(f_t);
-  lp.A.i.byte_loads += nnzs_processed * sizeof(i_t);
-  delta_y.byte_loads += nnzs_processed * sizeof(f_t);
-  lp.A.col_start.byte_loads += 2 * num_nonbasic * sizeof(i_t);
-  delta_z.byte_stores += (m + 1 + num_nonbasic) * sizeof(f_t);
+  work_estimate += 3 * num_nonbasic;
+  work_estimate += 3 * nnzs_processed;
+  work_estimate += 2 * delta_z_indices.size();
 }
 
 template <typename i_t, typename f_t>
@@ -382,67 +374,42 @@ void compute_delta_z(const csc_matrix_t<i_t, f_t>& A_transpose,
                      const sparse_vector_t<i_t, f_t>& delta_y,
                      i_t leaving_index,
                      i_t direction,
-                     ins_vector<i_t>& nonbasic_mark,
-                     ins_vector<i_t>& delta_z_mark,
-                     ins_vector<i_t>& delta_z_indices,
-                     ins_vector<f_t>& delta_z)
+                     std::vector<i_t>& nonbasic_mark,
+                     std::vector<i_t>& delta_z_mark,
+                     std::vector<i_t>& delta_z_indices,
+                     std::vector<f_t>& delta_z,
+                     f_t& work_estimate)
 {
-  auto& At_cs  = A_transpose.col_start.underlying();
-  auto& At_i   = A_transpose.i.underlying();
-  auto& At_x   = A_transpose.x.underlying();
-  auto& dy_i   = delta_y.i.underlying();
-  auto& dy_x   = delta_y.x.underlying();
-  auto& nb_mk  = nonbasic_mark.underlying();
-  auto& dz_mk  = delta_z_mark.underlying();
-  auto& dz_idx = delta_z_indices.underlying();
-  auto& dz     = delta_z.underlying();
-
   // delta_zN = - N'*delta_y
-  const i_t nz_delta_y = dy_i.size();
-  size_t nnz_processed = 0;
-  size_t nb_reads      = 0;
-  size_t dz_rmws       = 0;
-  size_t dz_mk_reads   = 0;
-  size_t dz_mk_writes  = 0;
-  size_t dz_idx_writes = 0;
+  const i_t nz_delta_y   = delta_y.i.size();
+  size_t nnz_processed   = 0;
+  size_t nonbasic_marked = 0;
   for (i_t k = 0; k < nz_delta_y; k++) {
-    const i_t i         = dy_i[k];
-    const f_t delta_y_i = dy_x[k];
+    const i_t i         = delta_y.i[k];
+    const f_t delta_y_i = delta_y.x[k];
     if (std::abs(delta_y_i) < 1e-12) { continue; }
-    const i_t row_start = At_cs[i];
-    const i_t row_end   = At_cs[i + 1];
+    const i_t row_start = A_transpose.col_start[i];
+    const i_t row_end   = A_transpose.col_start[i + 1];
     nnz_processed += row_end - row_start;
     for (i_t p = row_start; p < row_end; ++p) {
-      const i_t j = At_i[p];
-      nb_reads++;
-      if (nb_mk[j] >= 0) {
-        dz[j] -= delta_y_i * At_x[p];
-        dz_rmws++;
-        dz_mk_reads++;
-        if (!dz_mk[j]) {
-          dz_mk[j] = 1;
-          dz_idx.push_back(j);
-          dz_mk_writes++;
-          dz_idx_writes++;
+      const i_t j = A_transpose.i[p];
+      if (nonbasic_mark[j] >= 0) {
+        delta_z[j] -= delta_y_i * A_transpose.x[p];
+        nonbasic_marked++;
+        if (!delta_z_mark[j]) {
+          delta_z_mark[j] = 1;
+          delta_z_indices.push_back(j);
         }
       }
     }
   }
+  work_estimate += 4 * nz_delta_y;
+  work_estimate += 2 * nnz_processed;
+  work_estimate += 3 * nonbasic_marked;
+  work_estimate += 2 * delta_z_indices.size();
 
   // delta_zB = sigma*ei
-  dz[leaving_index] = direction;
-
-  delta_y.i.byte_loads += nz_delta_y * sizeof(i_t);
-  delta_y.x.byte_loads += nz_delta_y * sizeof(f_t);
-  A_transpose.col_start.byte_loads += 2 * nz_delta_y * sizeof(i_t);
-  A_transpose.i.byte_loads += nnz_processed * sizeof(i_t);
-  A_transpose.x.byte_loads += dz_rmws * sizeof(f_t);
-  nonbasic_mark.byte_loads += nb_reads * sizeof(i_t);
-  delta_z.byte_loads += dz_rmws * sizeof(f_t);
-  delta_z.byte_stores += (dz_rmws + 1) * sizeof(f_t);
-  delta_z_mark.byte_loads += dz_mk_reads * sizeof(i_t);
-  delta_z_mark.byte_stores += dz_mk_writes * sizeof(i_t);
-  delta_z_indices.byte_stores += dz_idx_writes * sizeof(i_t);
+  delta_z[leaving_index] = direction;
 
 #ifdef CHECK_CHANGE_IN_REDUCED_COST
   delta_y_sparse.to_dense(delta_y);
@@ -480,7 +447,8 @@ void compute_reduced_costs(const std::vector<f_t>& objective,
                            const std::vector<f_t>& y,
                            const std::vector<i_t>& basic_list,
                            const std::vector<i_t>& nonbasic_list,
-                           std::vector<f_t>& z)
+                           std::vector<f_t>& z,
+                           f_t& work_estimate)
 {
   PHASE2_NVTX_RANGE("DualSimplex::compute_reduced_costs");
 
@@ -499,12 +467,15 @@ void compute_reduced_costs(const std::vector<f_t>& objective,
     for (i_t p = col_start; p < col_end; ++p) {
       dot += A.x[p] * y[A.i[p]];
     }
+    work_estimate += 3 * (col_end - col_start);
     z[j] -= dot;
   }
+  work_estimate += 5 * (n - m);
   // zB = 0
   for (i_t k = 0; k < m; ++k) {
     z[basic_list[k]] = 0.0;
   }
+  work_estimate += 2 * m;
 }
 
 template <typename i_t, typename f_t>
@@ -515,12 +486,14 @@ void compute_primal_variables(const basis_update_mpf_t<i_t, f_t>& ft,
                               const std::vector<i_t>& nonbasic_list,
                               f_t tight_tol,
                               std::vector<f_t>& x,
-                              ins_vector<f_t>& xB_workspace)
+                              std::vector<f_t>& xB_workspace,
+                              f_t& work_estimate)
 {
   PHASE2_NVTX_RANGE("DualSimplex::compute_primal_variables");
   const i_t m          = A.m;
   const i_t n          = A.n;
   std::vector<f_t> rhs = lp_rhs;
+  work_estimate += 2 * m;
   // rhs = b - sum_{j : x_j = l_j} A(:, j) * l(j)
   //         - sum_{j : x_j = u_j} A(:, j) * u(j)
   for (i_t k = 0; k < n - m; ++k) {
@@ -532,25 +505,31 @@ void compute_primal_variables(const basis_update_mpf_t<i_t, f_t>& ft,
     for (i_t p = col_start; p < col_end; ++p) {
       rhs[A.i[p]] -= xj * A.x[p];
     }
+    work_estimate += 3 * (col_end - col_start);
   }
+  work_estimate += 5 * (n - m);
 
   xB_workspace.resize(m);
+  work_estimate += m;
   ft.b_solve(rhs, xB_workspace);
 
   for (i_t k = 0; k < m; ++k) {
     const i_t j = basic_list[k];
     x[j]        = xB_workspace[k];
   }
+  work_estimate += 2 * m;
 }
 
+// Work is 3*delta_z_indices.size()
 template <typename i_t, typename f_t>
 void clear_delta_z(i_t entering_index,
                    i_t leaving_index,
-                   ins_vector<i_t>& delta_z_mark,
-                   ins_vector<i_t>& delta_z_indices,
-                   ins_vector<f_t>& delta_z)
+                   std::vector<i_t>& delta_z_mark,
+                   std::vector<i_t>& delta_z_indices,
+                   std::vector<f_t>& delta_z)
 {
-  for (i_t k = 0; k < delta_z_indices.size(); k++) {
+  const i_t nz = delta_z_indices.size();
+  for (i_t k = 0; k < nz; k++) {
     const i_t j     = delta_z_indices[k];
     delta_z[j]      = 0.0;
     delta_z_mark[j] = 0;
@@ -564,13 +543,15 @@ template <typename i_t, typename f_t>
 void clear_delta_x(const std::vector<i_t>& basic_list,
                    i_t entering_index,
                    sparse_vector_t<i_t, f_t>& scaled_delta_xB_sparse,
-                   ins_vector<f_t>& delta_x)
+                   std::vector<f_t>& delta_x,
+                   f_t& work_estimate)
 {
   const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size();
   for (i_t k = 0; k < scaled_delta_xB_nz; ++k) {
     const i_t j = basic_list[scaled_delta_xB_sparse.i[k]];
     delta_x[j]  = 0.0;
   }
+  work_estimate += 3 * scaled_delta_xB_nz;
   // Leaving index already included above
   delta_x[entering_index] = 0.0;
   scaled_delta_xB_sparse.i.clear();
@@ -631,7 +612,7 @@ void vstatus_changes(const std::vector<variable_status_t>& vstatus,
 template <typename f_t>
 void compute_bounded_info(const std::vector<f_t>& lower,
                           const std::vector<f_t>& upper,
-                          ins_vector<uint8_t>& bounded_variables)
+                          std::vector<uint8_t>& bounded_variables)
 {
   const size_t n = lower.size();
   for (size_t j = 0; j < n; j++) {
@@ -646,17 +627,20 @@ void compute_dual_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
                                       const std::vector<i_t>& basic_list,
                                       const std::vector<i_t>& nonbasic_list,
                                       std::vector<f_t>& y,
-                                      std::vector<f_t>& z)
+                                      std::vector<f_t>& z,
+                                      f_t& work_estimate)
 {
   const i_t m = lp.num_rows;
   const i_t n = lp.num_cols;
 
   y.resize(m);
-  ins_vector<f_t> cB(m);
+  std::vector<f_t> cB(m);
+  work_estimate += 2 * m;
   for (i_t k = 0; k < m; ++k) {
     const i_t j = basic_list[k];
     cB[k]       = lp.objective[j];
   }
+  work_estimate += 3 * m;
   ft.b_transpose_solve(cB, y);
 
   // We want A'y + z = c
@@ -664,6 +648,7 @@ void compute_dual_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
   // B' y = c_B, z_B = 0
   // N' y + z_N = c_N
   z.resize(n);
+  work_estimate += n;
   // zN = cN - N'*y
   for (i_t k = 0; k < n - m; k++) {
     const i_t j = nonbasic_list[k];
@@ -677,12 +662,15 @@ void compute_dual_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
     for (i_t p = col_start; p < col_end; ++p) {
       dot += lp.A.x[p] * y[lp.A.i[p]];
     }
+    work_estimate += 3 * (col_end - col_start);
     z[j] -= dot;
   }
+  work_estimate += 5 * (n - m);
   // zB = 0
   for (i_t k = 0; k < m; ++k) {
     z[basic_list[k]] = 0.0;
   }
+  work_estimate += 2 * m;
 }
 
 template <typename i_t, typename f_t>
@@ -692,11 +680,13 @@ i_t compute_primal_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
                                        const std::vector<i_t>& nonbasic_list,
                                        const std::vector<variable_status_t>& vstatus,
                                        std::vector<f_t>& x,
-                                       ins_vector<f_t>& xB_workspace)
+                                       std::vector<f_t>& xB_workspace,
+                                       f_t& work_estimate)
 {
   const i_t m          = lp.num_rows;
   const i_t n          = lp.num_cols;
   std::vector<f_t> rhs = lp.rhs;
+  work_estimate += 2 * m;
 
   for (i_t k = 0; k < n - m; ++k) {
     const i_t j = nonbasic_list[k];
@@ -709,6 +699,7 @@ i_t compute_primal_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
       x[j] = 0.0;
     }
   }
+  work_estimate += 4 * (n - m);
 
   // rhs = b - sum_{j : x_j = l_j} A(:, j) l(j) - sum_{j : x_j = u_j} A(:, j) *
   // u(j)
@@ -720,25 +711,30 @@ i_t compute_primal_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
     for (i_t p = col_start; p < col_end; ++p) {
       rhs[lp.A.i[p]] -= xj * lp.A.x[p];
     }
+    work_estimate += 3 * (col_end - col_start);
   }
+  work_estimate += 4 * (n - m);
 
   xB_workspace.resize(m);
+  work_estimate += m;
   ft.b_solve(rhs, xB_workspace);
 
   for (i_t k = 0; k < m; ++k) {
     const i_t j = basic_list[k];
     x[j]        = xB_workspace[k];
   }
+  work_estimate += 3 * m;
   return 0;
 }
 
+// Work is 4*m + 2*n
 template <typename i_t, typename f_t>
 f_t compute_initial_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                            const simplex_solver_settings_t<i_t, f_t>& settings,
                                            const std::vector<i_t>& basic_list,
                                            const std::vector<f_t>& x,
-                                           ins_vector<f_t>& squared_infeasibilities,
-                                           ins_vector<i_t>& infeasibility_indices,
+                                           std::vector<f_t>& squared_infeasibilities,
+                                           std::vector<i_t>& infeasibility_indices,
                                            f_t& primal_inf)
 {
   PHASE2_NVTX_RANGE("DualSimplex::compute_initial_primal_infeasibilities");
@@ -816,7 +812,8 @@ void update_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                    std::vector<i_t>& basic_change_list,
                                    std::vector<f_t>& squared_infeasibilities,
                                    std::vector<i_t>& infeasibility_indices,
-                                   f_t& primal_inf)
+                                   f_t& primal_inf,
+                                   f_t& work_estimate)
 {
   const f_t primal_tol = settings.primal_tol;
   const i_t nz         = basic_change_list.size();
@@ -841,45 +838,41 @@ void update_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                        j,
                                        primal_inf);
   }
+  work_estimate += 8 * nz;
 }
 
 template <typename i_t, typename f_t>
-void clean_up_infeasibilities(ins_vector<f_t>& squared_infeasibilities,
-                              ins_vector<i_t>& infeasibility_indices)
+void clean_up_infeasibilities(std::vector<f_t>& squared_infeasibilities,
+                              std::vector<i_t>& infeasibility_indices,
+                              f_t& work_estimate)
 {
-  auto& sq_inf = squared_infeasibilities.underlying();
-  auto& idx    = infeasibility_indices.underlying();
-
   bool needs_clean_up  = false;
-  const i_t initial_nz = idx.size();
+  const i_t initial_nz = infeasibility_indices.size();
   for (i_t k = 0; k < initial_nz; ++k) {
-    const i_t j              = idx[k];
-    const f_t squared_infeas = sq_inf[j];
+    const i_t j              = infeasibility_indices[k];
+    const f_t squared_infeas = squared_infeasibilities[j];
     if (squared_infeas == 0.0) { needs_clean_up = true; }
   }
-
-  size_t idx_accesses = initial_nz;
-  size_t sq_accesses  = initial_nz;
+  work_estimate += 2 * initial_nz;
 
   if (needs_clean_up) {
-    for (size_t k = 0; k < idx.size(); ++k) {
-      const i_t j              = idx[k];
-      const f_t squared_infeas = sq_inf[j];
-      idx_accesses++;
-      sq_accesses++;
+    i_t num_cleans = 0;
+    work_estimate += 2 * infeasibility_indices.size();
+    for (size_t k = 0; k < infeasibility_indices.size(); ++k) {
+      const i_t j              = infeasibility_indices[k];
+      const f_t squared_infeas = squared_infeasibilities[j];
       if (squared_infeas == 0.0) {
-        const i_t new_j = idx.back();
-        idx[k]          = new_j;
-        idx.pop_back();
-        idx_accesses += 2;
-        sq_accesses++;
-        if (sq_inf[new_j] == 0.0) { k--; }
+        const i_t new_j          = infeasibility_indices.back();
+        infeasibility_indices[k] = new_j;
+        infeasibility_indices.pop_back();
+        if (squared_infeasibilities[new_j] == 0.0) {
+          k--;
+        }  // Decrement k so that we process the same index again
+        num_cleans++;
       }
     }
+    work_estimate += 4 * num_cleans;
   }
-
-  infeasibility_indices.byte_loads += idx_accesses * sizeof(i_t);
-  squared_infeasibilities.byte_loads += sq_accesses * sizeof(f_t);
 }
 
 template <typename i_t, typename f_t>
@@ -887,23 +880,21 @@ i_t steepest_edge_pricing_with_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                                const simplex_solver_settings_t<i_t, f_t>& settings,
                                                const std::vector<f_t>& x,
                                                const std::vector<f_t>& dy_steepest_edge,
-                                               const ins_vector<i_t>& basic_mark,
-                                               ins_vector<f_t>& squared_infeasibilities,
-                                               ins_vector<i_t>& infeasibility_indices,
+                                               const std::vector<i_t>& basic_mark,
+                                               std::vector<f_t>& squared_infeasibilities,
+                                               std::vector<i_t>& infeasibility_indices,
                                                i_t& direction,
                                                i_t& basic_leaving,
-                                               f_t& max_val)
+                                               f_t& max_val,
+                                               f_t& work_estimate)
 {
-  auto& sq_inf = squared_infeasibilities.underlying();
-  auto& idx    = infeasibility_indices.underlying();
-  auto& bmark  = basic_mark.underlying();
-
   max_val           = 0.0;
   i_t leaving_index = -1;
-  const i_t nz      = idx.size();
+  const i_t nz      = infeasibility_indices.size();
+  i_t max_count     = 0;
   for (i_t k = 0; k < nz; ++k) {
-    const i_t j              = idx[k];
-    const f_t squared_infeas = sq_inf[j];
+    const i_t j              = infeasibility_indices[k];
+    const f_t squared_infeas = squared_infeasibilities[j];
     const f_t val            = squared_infeas / dy_steepest_edge[j];
     if (val > max_val || (val == max_val && j > leaving_index)) {
       max_val                = val;
@@ -911,14 +902,12 @@ i_t steepest_edge_pricing_with_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
       const f_t lower_infeas = lp.lower[j] - x[j];
       const f_t upper_infeas = x[j] - lp.upper[j];
       direction              = lower_infeas >= upper_infeas ? 1 : -1;
+      max_count++;
     }
   }
+  work_estimate += 3 * nz + 3 * max_count;
 
-  basic_leaving = leaving_index >= 0 ? bmark[leaving_index] : -1;
-
-  infeasibility_indices.byte_loads += nz * sizeof(i_t);
-  squared_infeasibilities.byte_loads += nz * sizeof(f_t);
-  if (leaving_index >= 0) { basic_mark.byte_loads += sizeof(i_t); }
+  basic_leaving = leaving_index >= 0 ? basic_mark[leaving_index] : -1;
 
   return leaving_index;
 }
@@ -1042,7 +1031,7 @@ f_t first_stage_harris(const lp_problem_t<i_t, f_t>& lp,
                        const std::vector<variable_status_t>& vstatus,
                        const std::vector<i_t>& nonbasic_list,
                        std::vector<f_t>& z,
-                       ins_vector<f_t>& delta_z)
+                       std::vector<f_t>& delta_z)
 {
   const i_t n             = lp.num_cols;
   const i_t m             = lp.num_rows;
@@ -1076,7 +1065,7 @@ i_t second_stage_harris(const lp_problem_t<i_t, f_t>& lp,
                         const std::vector<variable_status_t>& vstatus,
                         const std::vector<i_t>& nonbasic_list,
                         const std::vector<f_t>& z,
-                        const ins_vector<f_t>& delta_z,
+                        const std::vector<f_t>& delta_z,
                         f_t max_step_length,
                         f_t& step_length,
                         i_t& nonbasic_entering)
@@ -1119,7 +1108,7 @@ i_t phase2_ratio_test(const lp_problem_t<i_t, f_t>& lp,
                       std::vector<variable_status_t>& vstatus,
                       std::vector<i_t>& nonbasic_list,
                       std::vector<f_t>& z,
-                      ins_vector<f_t>& delta_z,
+                      std::vector<f_t>& delta_z,
                       f_t& step_length,
                       i_t& nonbasic_entering)
 {
@@ -1169,17 +1158,18 @@ i_t phase2_ratio_test(const lp_problem_t<i_t, f_t>& lp,
 template <typename i_t, typename f_t>
 i_t flip_bounds(const lp_problem_t<i_t, f_t>& lp,
                 const simplex_solver_settings_t<i_t, f_t>& settings,
-                const ins_vector<uint8_t>& bounded_variables,
-                const ins_vector<f_t>& objective,
+                const std::vector<uint8_t>& bounded_variables,
+                const std::vector<f_t>& objective,
                 const std::vector<f_t>& z,
-                const ins_vector<i_t>& delta_z_indices,
+                const std::vector<i_t>& delta_z_indices,
                 const std::vector<i_t>& nonbasic_list,
                 i_t entering_index,
                 std::vector<variable_status_t>& vstatus,
-                ins_vector<f_t>& delta_x,
-                ins_vector<i_t>& mark,
-                ins_vector<f_t>& atilde,
-                ins_vector<i_t>& atilde_index)
+                std::vector<f_t>& delta_x,
+                std::vector<i_t>& mark,
+                std::vector<f_t>& atilde,
+                std::vector<i_t>& atilde_index,
+                f_t& work_estimate)
 {
   i_t num_flipped = 0;
   for (i_t k = 0; k < delta_z_indices.size(); ++k) {
@@ -1191,8 +1181,11 @@ i_t flip_bounds(const lp_problem_t<i_t, f_t>& lp,
     const f_t dual_tol =
       settings.dual_tol;  // lower to 1e-7 or less will cause 25fv47 and d2q06c to cycle
     if (vstatus[j] == variable_status_t::NONBASIC_LOWER && z[j] < -dual_tol) {
-      const f_t delta = lp.upper[j] - lp.lower[j];
+      const f_t delta                = lp.upper[j] - lp.lower[j];
+      const size_t atilde_start_size = atilde_index.size();
       scatter_dense(lp.A, j, -delta, atilde, mark, atilde_index);
+      work_estimate += 2 * (atilde_index.size() - atilde_start_size) +
+                       4 * (lp.A.col_start[j + 1] - lp.A.col_start[j]) + 10;
       delta_x[j] += delta;
       vstatus[j] = variable_status_t::NONBASIC_UPPER;
 #ifdef BOUND_FLIP_DEBUG
@@ -1201,8 +1194,11 @@ i_t flip_bounds(const lp_problem_t<i_t, f_t>& lp,
 #endif
       num_flipped++;
     } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER && z[j] > dual_tol) {
-      const f_t delta = lp.lower[j] - lp.upper[j];
+      const f_t delta                = lp.lower[j] - lp.upper[j];
+      const size_t atilde_start_size = atilde_index.size();
       scatter_dense(lp.A, j, -delta, atilde, mark, atilde_index);
+      work_estimate += 2 * (atilde_index.size() - atilde_start_size) +
+                       4 * (lp.A.col_start[j + 1] - lp.A.col_start[j]) + 10;
       delta_x[j] += delta;
       vstatus[j] = variable_status_t::NONBASIC_LOWER;
 #ifdef BOUND_FLIP_DEBUG
@@ -1239,7 +1235,8 @@ i_t initialize_steepest_edge_norms(const lp_problem_t<i_t, f_t>& lp,
                                    const f_t start_time,
                                    const std::vector<i_t>& basic_list,
                                    basis_update_mpf_t<i_t, f_t>& ft,
-                                   std::vector<f_t>& delta_y_steepest_edge)
+                                   std::vector<f_t>& delta_y_steepest_edge,
+                                   f_t& work_estimate)
 {
   const i_t m = basic_list.size();
 
@@ -1251,6 +1248,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t<i_t, f_t>& lp,
   std::vector<i_t> row_degree(m, 0);
   std::vector<i_t> mapping(m, -1);
   std::vector<f_t> coeff(m, 0.0);
+  work_estimate += 3 * m;
 
   for (i_t k = 0; k < m; ++k) {
     const i_t j         = basic_list[k];
@@ -1263,7 +1261,9 @@ i_t initialize_steepest_edge_norms(const lp_problem_t<i_t, f_t>& lp,
       mapping[k] = i;
       coeff[k]   = lp.A.x[p];
     }
+    work_estimate += 5 * (col_end - col_start);
   }
+  work_estimate += 3 * m;
 
 #ifdef CHECK_SINGLETON_ROWS
   csc_matrix_t<i_t, f_t> B(m, m, 0);
@@ -1285,6 +1285,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t<i_t, f_t>& lp,
 #endif
     }
   }
+  work_estimate += m;
 
   if (num_singleton_rows > 0) {
     settings.log.printf("Found %d singleton rows for steepest edge norms in %.2fs\n",
@@ -1347,6 +1348,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t<i_t, f_t>& lp,
       for (i_t p = 0; p < sparse_dy.x.size(); ++p) {
         my_init += sparse_dy.x[p] * sparse_dy.x[p];
       }
+      work_estimate += 2 * sparse_dy.x.size();
 #endif
 #if COMPARE_WITH_DENSE
       if (std::abs(init - my_init) > 1e-12) {
@@ -1375,6 +1377,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t<i_t, f_t>& lp,
       return CONCURRENT_HALT_RETURN;
     }
   }
+  work_estimate += 7 * m;
   return 0;
 }
 
@@ -1388,17 +1391,22 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& settin
                                const sparse_vector_t<i_t, f_t>& scaled_delta_xB,
                                i_t basic_leaving_index,
                                i_t entering_index,
-                               ins_vector<f_t>& v,
+                               std::vector<f_t>& v,
                                sparse_vector_t<i_t, f_t>& v_sparse,
-                               std::vector<f_t>& delta_y_steepest_edge)
+                               std::vector<f_t>& delta_y_steepest_edge,
+                               f_t& work_estimate)
 {
   const i_t delta_y_nz = delta_y_sparse.i.size();
   v_sparse.clear();
   // B^T delta_y = - direction * e_basic_leaving_index
   // We want B v =  - B^{-T} e_basic_leaving_index
   ft.b_solve(delta_y_sparse, v_sparse);
-  if (direction == -1) { v_sparse.negate(); }
+  if (direction == -1) {
+    v_sparse.negate();
+    work_estimate += 2 * v_sparse.i.size();
+  }
   v_sparse.scatter(v);
+  work_estimate += 2 * v_sparse.i.size();
 
   const i_t leaving_index        = basic_list[basic_leaving_index];
   const f_t prev_dy_norm_squared = delta_y_steepest_edge[leaving_index];
@@ -1417,23 +1425,21 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& settin
   // B*w = A(:, leaving_index)
   // B*scaled_delta_xB = -A(:, leaving_index) so w = -scaled_delta_xB
   const f_t wr = -scaled_delta_xB.find_coefficient(basic_leaving_index);
+  work_estimate += scaled_delta_xB.i.size();
   if (wr == 0) { return -1; }
   const f_t omegar             = dy_norm_squared / (wr * wr);
   const i_t scaled_delta_xB_nz = scaled_delta_xB.i.size();
 
-  auto& dxB_i = scaled_delta_xB.i.underlying();
-  auto& dxB_x = scaled_delta_xB.x.underlying();
-  auto& v_raw = v.underlying();
-
   for (i_t h = 0; h < scaled_delta_xB_nz; ++h) {
-    const i_t k = dxB_i[h];
+    const i_t k = scaled_delta_xB.i[h];
     const i_t j = basic_list[k];
     if (k == basic_leaving_index) {
-      const f_t w_squared      = dxB_x[h] * dxB_x[h];
+      const f_t w              = scaled_delta_xB.x[h];
+      const f_t w_squared      = w * w;
       delta_y_steepest_edge[j] = (1.0 / w_squared) * dy_norm_squared;
     } else {
-      const f_t wk = -dxB_x[h];
-      f_t new_val  = delta_y_steepest_edge[j] + wk * (2.0 * v_raw[k] / wr + wk * omegar);
+      const f_t wk = -scaled_delta_xB.x[h];
+      f_t new_val  = delta_y_steepest_edge[j] + wk * (2.0 * v[k] / wr + wk * omegar);
       new_val      = std::max(new_val, 1e-4);
 #ifdef STEEPEST_EDGE_DEBUG
       if (!(new_val >= 0)) {
@@ -1452,18 +1458,13 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& settin
       delta_y_steepest_edge[j] = new_val;
     }
   }
+  work_estimate += 5 * scaled_delta_xB_nz;
 
-  auto& vs_i     = v_sparse.i.underlying();
-  const i_t v_nz = vs_i.size();
+  const i_t v_nz = v_sparse.i.size();
   for (i_t k = 0; k < v_nz; ++k) {
-    v_raw[vs_i[k]] = 0.0;
+    v[v_sparse.i[k]] = 0.0;
   }
-
-  scaled_delta_xB.i.byte_loads += scaled_delta_xB_nz * sizeof(i_t);
-  scaled_delta_xB.x.byte_loads += scaled_delta_xB_nz * sizeof(f_t);
-  v.byte_loads += scaled_delta_xB_nz * sizeof(f_t);
-  v.byte_stores += v_nz * sizeof(f_t);
-  v_sparse.i.byte_loads += v_nz * sizeof(i_t);
+  work_estimate += 2 * v_nz;
 
   return 0;
 }
@@ -1520,10 +1521,11 @@ i_t check_steepest_edge_norms(const simplex_solver_settings_t<i_t, f_t>& setting
 template <typename i_t, typename f_t>
 i_t compute_perturbation(const lp_problem_t<i_t, f_t>& lp,
                          const simplex_solver_settings_t<i_t, f_t>& settings,
-                         const ins_vector<i_t>& delta_z_indices,
+                         const std::vector<i_t>& delta_z_indices,
                          std::vector<f_t>& z,
-                         ins_vector<f_t>& objective,
-                         f_t& sum_perturb)
+                         std::vector<f_t>& objective,
+                         f_t& sum_perturb,
+                         f_t& work_estimate)
 {
   const i_t n         = lp.num_cols;
   const i_t m         = lp.num_rows;
@@ -1558,6 +1560,7 @@ i_t compute_perturbation(const lp_problem_t<i_t, f_t>& lp,
 #endif
     }
   }
+  work_estimate += 7 * delta_z_indices.size();
 #ifdef PERTURBATION_DEBUG
   if (num_perturb > 0) {
     settings.log.printf("Perturbed %d dual variables by %e\n", num_perturb, sum_perturb);
@@ -1566,11 +1569,12 @@ i_t compute_perturbation(const lp_problem_t<i_t, f_t>& lp,
   return 0;
 }
 
-template <typename i_t>
+template <typename i_t, typename f_t>
 void reset_basis_mark(const std::vector<i_t>& basic_list,
                       const std::vector<i_t>& nonbasic_list,
-                      ins_vector<i_t>& basic_mark,
-                      ins_vector<i_t>& nonbasic_mark)
+                      std::vector<i_t>& basic_mark,
+                      std::vector<i_t>& nonbasic_mark,
+                      f_t& work_estimate)
 {
   const i_t m         = basic_list.size();
   const i_t n         = nonbasic_mark.size();
@@ -1579,18 +1583,22 @@ void reset_basis_mark(const std::vector<i_t>& basic_list,
   for (i_t k = 0; k < n; k++) {
     basic_mark[k] = -1;
   }
+  work_estimate += n;
 
   for (i_t k = 0; k < n; k++) {
     nonbasic_mark[k] = -1;
   }
+  work_estimate += n;
 
   for (i_t k = 0; k < n_minus_m; k++) {
     nonbasic_mark[nonbasic_list[k]] = k;
   }
+  work_estimate += 2 * n_minus_m;
 
   for (i_t k = 0; k < m; k++) {
     basic_mark[basic_list[k]] = k;
   }
+  work_estimate += 2 * m;
 }
 
 template <typename i_t, typename f_t>
@@ -1617,20 +1625,14 @@ void compute_delta_y(const basis_update_mpf_t<i_t, f_t>& ft,
 #ifdef CHECK_B_TRANSPOSE_SOLVE
   std::vector<f_t> delta_y_sparse_vector_check(m);
   delta_y_sparse.to_dense(delta_y_sparse_vector_check);
-  f_t error_check = 0.0;
-  for (i_t k = 0; k < m; ++k) {
-    if (std::abs(delta_y[k] - delta_y_sparse_vector_check[k]) > 1e-6) {
-      settings.log.printf(
-        "\tBTranspose error %d %e %e\n", k, delta_y[k], delta_y_sparse_vector_check[k]);
-    }
-    error_check += std::abs(delta_y[k] - delta_y_sparse_vector_check[k]);
-  }
-  if (error_check > 1e-6) { settings.log.printf("BTranspose error %e\n", error_check); }
+  // Pass in basic_list and lp for this code to work
   std::vector<f_t> residual(m);
+  std::vector<f_t> ei(m, 0);
+  ei[basic_leaving_index] = -direction;
   b_transpose_multiply(lp, basic_list, delta_y_sparse_vector_check, residual);
   for (i_t k = 0; k < m; ++k) {
     if (std::abs(residual[k] - ei[k]) > 1e-6) {
-      settings.log.printf("\tBTranspose multiply error %d %e %e\n", k, residual[k], ei[k]);
+      printf("\tBTranspose multiply error %d %e %e\n", k, residual[k], ei[k]);
     }
   }
 #endif
@@ -1638,12 +1640,13 @@ void compute_delta_y(const basis_update_mpf_t<i_t, f_t>& ft,
 
 template <typename i_t, typename f_t>
 i_t update_dual_variables(const sparse_vector_t<i_t, f_t>& delta_y_sparse,
-                          const ins_vector<i_t>& delta_z_indices,
-                          const ins_vector<f_t>& delta_z,
+                          const std::vector<i_t>& delta_z_indices,
+                          const std::vector<f_t>& delta_z,
                           f_t step_length,
                           i_t leaving_index,
                           std::vector<f_t>& y,
-                          std::vector<f_t>& z)
+                          std::vector<f_t>& z,
+                          f_t& work_estimate)
 {
   // Update dual variables
   // y <- y + steplength * delta_y
@@ -1652,12 +1655,14 @@ i_t update_dual_variables(const sparse_vector_t<i_t, f_t>& delta_y_sparse,
     const i_t i = delta_y_sparse.i[k];
     y[i] += step_length * delta_y_sparse.x[k];
   }
+  work_estimate += 3 * delta_y_nz;
   // z <- z + steplength * delta_z
   const i_t delta_z_nz = delta_z_indices.size();
   for (i_t k = 0; k < delta_z_nz; ++k) {
     const i_t j = delta_z_indices[k];
     z[j] += step_length * delta_z[j];
   }
+  work_estimate += 3 * delta_z_nz;
   z[leaving_index] += step_length * delta_z[leaving_index];
   return 0;
 }
@@ -1665,14 +1670,15 @@ i_t update_dual_variables(const sparse_vector_t<i_t, f_t>& delta_y_sparse,
 template <typename i_t, typename f_t>
 void adjust_for_flips(const basis_update_mpf_t<i_t, f_t>& ft,
                       const std::vector<i_t>& basic_list,
-                      const ins_vector<i_t>& delta_z_indices,
-                      ins_vector<i_t>& atilde_index,
-                      ins_vector<f_t>& atilde,
-                      ins_vector<i_t>& atilde_mark,
+                      const std::vector<i_t>& delta_z_indices,
+                      std::vector<i_t>& atilde_index,
+                      std::vector<f_t>& atilde,
+                      std::vector<i_t>& atilde_mark,
                       sparse_vector_t<i_t, f_t>& atilde_sparse,
                       sparse_vector_t<i_t, f_t>& delta_xB_0_sparse,
-                      ins_vector<f_t>& delta_x_flip,
-                      std::vector<f_t>& x)
+                      std::vector<f_t>& delta_x_flip,
+                      std::vector<f_t>& x,
+                      f_t& work_estimate)
 {
   const i_t atilde_nz = atilde_index.size();
   // B*delta_xB_0 = atilde
@@ -1683,27 +1689,30 @@ void adjust_for_flips(const basis_update_mpf_t<i_t, f_t>& ft,
     atilde_sparse.i.push_back(atilde_index[k]);
     atilde_sparse.x.push_back(atilde[atilde_index[k]]);
   }
+  work_estimate += 5 * atilde_nz;
   ft.b_solve(atilde_sparse, delta_xB_0_sparse);
   const i_t delta_xB_0_nz = delta_xB_0_sparse.i.size();
   for (i_t k = 0; k < delta_xB_0_nz; ++k) {
     const i_t j = basic_list[delta_xB_0_sparse.i[k]];
     x[j] += delta_xB_0_sparse.x[k];
   }
-
+  work_estimate += 4 * delta_xB_0_nz;
   for (i_t k = 0; k < delta_z_indices.size(); ++k) {
     const i_t j = delta_z_indices[k];
     x[j] += delta_x_flip[j];
     delta_x_flip[j] = 0.0;
   }
-
+  work_estimate += 4 * delta_z_indices.size();
   // Clear atilde
   for (i_t k = 0; k < atilde_index.size(); ++k) {
     atilde[atilde_index[k]] = 0.0;
   }
+  work_estimate += 2 * atilde_index.size();
   // Clear atilde_mark
   for (i_t k = 0; k < atilde_mark.size(); ++k) {
     atilde_mark[k] = 0;
   }
+  work_estimate += atilde_mark.size();
   atilde_index.clear();
 }
 
@@ -1715,21 +1724,27 @@ i_t compute_delta_x(const lp_problem_t<i_t, f_t>& lp,
                     i_t basic_leaving_index,
                     i_t direction,
                     const std::vector<i_t>& basic_list,
-                    const ins_vector<f_t>& delta_x_flip,
+                    const std::vector<f_t>& delta_x_flip,
                     const sparse_vector_t<i_t, f_t>& rhs_sparse,
                     const std::vector<f_t>& x,
                     sparse_vector_t<i_t, f_t>& utilde_sparse,
                     sparse_vector_t<i_t, f_t>& scaled_delta_xB_sparse,
-                    ins_vector<f_t>& delta_x)
+                    std::vector<f_t>& delta_x,
+                    f_t& work_estimate)
 {
   f_t delta_x_leaving = direction == 1 ? lp.lower[leaving_index] - x[leaving_index]
                                        : lp.upper[leaving_index] - x[leaving_index];
   // B*w = -A(:, entering)
   ft.b_solve(rhs_sparse, scaled_delta_xB_sparse, utilde_sparse);
   scaled_delta_xB_sparse.negate();
+  work_estimate += 2 * scaled_delta_xB_sparse.i.size();
 
 #ifdef CHECK_B_SOLVE
+  const i_t m = basic_list.size();
   std::vector<f_t> scaled_delta_xB(m);
+  scaled_delta_xB_sparse.to_dense(scaled_delta_xB);
+  std::vector<f_t> rhs(m);
+  rhs_sparse.to_dense(rhs);
   {
     std::vector<f_t> residual_B(m);
     b_multiply(lp, basic_list, scaled_delta_xB, residual_B);
@@ -1737,28 +1752,35 @@ i_t compute_delta_x(const lp_problem_t<i_t, f_t>& lp,
     for (i_t k = 0; k < m; ++k) {
       const f_t err = std::abs(rhs[k] + residual_B[k]);
       if (err >= 1e-6) {
-        settings.log.printf(
-          "Bsolve diff %d %e rhs %e residual %e\n", k, err, rhs[k], residual_B[k]);
+        printf("Bsolve diff %d %e rhs %e residual %e\n", k, err, rhs[k], residual_B[k]);
       }
       err_max = std::max(err_max, err);
     }
-    if (err_max > 1e-6) { settings.log.printf("B multiply error %e\n", err_max); }
+    if (err_max > 1e-6) {
+      printf("B multiply error %e\n", err_max);
+    } else {
+      printf("B multiply error %e\n", err_max);
+    }
   }
 #endif
 
   f_t scale = scaled_delta_xB_sparse.find_coefficient(basic_leaving_index);
+  work_estimate += 2 * scaled_delta_xB_sparse.i.size();
   if (scale != scale) {
     // We couldn't find a coefficient for the basic leaving index.
     // The coefficient might be very small. Switch to a regular solve and try to recover.
     std::vector<f_t> rhs;
     rhs_sparse.to_dense(rhs);
+    work_estimate += 2 * rhs.size();
     const i_t m = basic_list.size();
     std::vector<f_t> scaled_delta_xB(m);
+    work_estimate += m;
     ft.b_solve(rhs, scaled_delta_xB);
     if (scaled_delta_xB[basic_leaving_index] != 0.0 &&
         !std::isnan(scaled_delta_xB[basic_leaving_index])) {
       scaled_delta_xB_sparse.from_dense(scaled_delta_xB);
       scaled_delta_xB_sparse.negate();
+      work_estimate += 2 * scaled_delta_xB_sparse.i.size() + scaled_delta_xB.size();
       scale = -scaled_delta_xB[basic_leaving_index];
     } else {
       return -1;
@@ -1770,6 +1792,7 @@ i_t compute_delta_x(const lp_problem_t<i_t, f_t>& lp,
     const i_t j = basic_list[scaled_delta_xB_sparse.i[k]];
     delta_x[j]  = primal_step_length * scaled_delta_xB_sparse.x[k];
   }
+  work_estimate += 4 * scaled_delta_xB_nz;
   delta_x[leaving_index]  = delta_x_leaving;
   delta_x[entering_index] = primal_step_length;
   return 0;
@@ -1778,9 +1801,10 @@ i_t compute_delta_x(const lp_problem_t<i_t, f_t>& lp,
 template <typename i_t, typename f_t>
 void update_primal_variables(const sparse_vector_t<i_t, f_t>& scaled_delta_xB_sparse,
                              const std::vector<i_t>& basic_list,
-                             const ins_vector<f_t>& delta_x,
+                             const std::vector<f_t>& delta_x,
                              i_t entering_index,
-                             std::vector<f_t>& x)
+                             std::vector<f_t>& x,
+                             f_t& work_estimate)
 {
   // x <- x + delta_x
   const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size();
@@ -1788,23 +1812,26 @@ void update_primal_variables(const sparse_vector_t<i_t, f_t>& scaled_delta_xB_sp
     const i_t j = basic_list[scaled_delta_xB_sparse.i[k]];
     x[j] += delta_x[j];
   }
+  work_estimate += 4 * scaled_delta_xB_nz;
   // Leaving index already included above
   x[entering_index] += delta_x[entering_index];
 }
 
 template <typename i_t, typename f_t>
 void update_objective(const std::vector<i_t>& basic_list,
-                      const ins_vector<i_t>& changed_basic_indices,
+                      const std::vector<i_t>& changed_basic_indices,
                       const std::vector<f_t>& objective,
-                      const ins_vector<f_t>& delta_x,
+                      const std::vector<f_t>& delta_x,
                       i_t entering_index,
-                      f_t& obj)
+                      f_t& obj,
+                      f_t& work_estimate)
 {
   const i_t changed_basic_nz = changed_basic_indices.size();
   for (i_t k = 0; k < changed_basic_nz; ++k) {
     const i_t j = basic_list[changed_basic_indices[k]];
     obj += delta_x[j] * objective[j];
   }
+  work_estimate += 4 * changed_basic_nz;
   // Leaving index already included above
   obj += delta_x[entering_index] * objective[entering_index];
 }
@@ -2219,7 +2246,7 @@ void set_primal_variables_on_bounds(const lp_problem_t<i_t, f_t>& lp,
 }
 
 template <typename f_t>
-f_t compute_perturbed_objective(const ins_vector<f_t>& objective, const std::vector<f_t>& x)
+f_t compute_perturbed_objective(const std::vector<f_t>& objective, const std::vector<f_t>& x)
 {
   const size_t n = objective.size();
   f_t obj_val    = 0.0;
@@ -2230,7 +2257,7 @@ f_t compute_perturbed_objective(const ins_vector<f_t>& objective, const std::vec
 }
 
 template <typename i_t, typename f_t>
-f_t amount_of_perturbation(const lp_problem_t<i_t, f_t>& lp, const ins_vector<f_t>& objective)
+f_t amount_of_perturbation(const lp_problem_t<i_t, f_t>& lp, const std::vector<f_t>& objective)
 {
   f_t perturbation = 0.0;
   const i_t n      = lp.num_cols;
@@ -2246,7 +2273,7 @@ void prepare_optimality(i_t info,
                         const lp_problem_t<i_t, f_t>& lp,
                         const simplex_solver_settings_t<i_t, f_t>& settings,
                         basis_update_mpf_t<i_t, f_t>& ft,
-                        const ins_vector<f_t>& objective,
+                        const std::vector<f_t>& objective,
                         const std::vector<i_t>& basic_list,
                         const std::vector<i_t>& nonbasic_list,
                         const std::vector<variable_status_t>& vstatus,
@@ -2259,8 +2286,9 @@ void prepare_optimality(i_t info,
                         std::vector<f_t>& z,
                         lp_solution_t<i_t, f_t>& sol)
 {
-  const i_t m = lp.num_rows;
-  const i_t n = lp.num_cols;
+  const i_t m       = lp.num_rows;
+  const i_t n       = lp.num_cols;
+  f_t work_estimate = 0;  // Work in this function is not captured
 
   sol.objective         = compute_objective(lp, sol.x);
   sol.user_objective    = compute_user_objective(lp, sol.objective);
@@ -2271,7 +2299,7 @@ void prepare_optimality(i_t info,
     std::vector<f_t> unperturbed_y(m);
     std::vector<f_t> unperturbed_z(n);
     phase2::compute_dual_solution_from_basis(
-      lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z);
+      lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z, work_estimate);
     {
       const f_t dual_infeas = phase2::dual_infeasibility(
         lp, settings, vstatus, unperturbed_z, settings.tight_tol, settings.dual_tol);
@@ -2468,6 +2496,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   assert(lp.lower.size() == n);
   assert(lp.upper.size() == n);
   assert(lp.rhs.size() == m);
+  f_t phase2_work_estimate = 0.0;
+  ft.clear_work_estimate();
 
   std::vector<f_t>& x = sol.x;
   std::vector<f_t>& y = sol.y;
@@ -2475,16 +2505,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
   // Declare instrumented vectors used during initialization (before aggregator setup)
   // Perturbed objective
-  ins_vector<f_t> objective(lp.objective);
-  ins_vector<f_t> c_basic(m);
-  ins_vector<f_t> xB_workspace(m);
-
-  // Create instrumentation aggregator early to capture init section memory operations
-  instrumentation_aggregator_t aggregator;
+  std::vector<f_t> objective(lp.objective);
+  std::vector<f_t> c_basic(m);
+  std::vector<f_t> xB_workspace(m);
 
-  aggregator.add("objective", objective);
-  aggregator.add("c_basic", c_basic);
-  aggregator.add("xB_workspace", xB_workspace);
+  phase2_work_estimate += 2 * (n + m);
 
   dual::status_t status = dual::status_t::UNSET;
 
@@ -2493,14 +2518,20 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   settings.log.printf("Dual Simplex Phase %d\n", phase);
   std::vector<variable_status_t> vstatus_old = vstatus;
   std::vector<f_t> z_old                     = z;
+  phase2_work_estimate += 4 * n;
 
   phase2::bound_info(lp, settings);
+  phase2_work_estimate += 2 * n;
+
   if (initialize_basis) {
     PHASE2_NVTX_RANGE("DualSimplex::init_basis");
     std::vector<i_t> superbasic_list;
     nonbasic_list.clear();
     nonbasic_list.reserve(n - m);
+    phase2_work_estimate += (n - m);
+
     get_basis_from_vstatus(m, vstatus, basic_list, nonbasic_list, superbasic_list);
+    phase2_work_estimate += 2 * n;
     assert(superbasic_list.size() == 0);
     assert(nonbasic_list.size() == n - m);
 
@@ -2517,6 +2548,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     const i_t j = basic_list[k];
     c_basic[k]  = objective[j];
   }
+  phase2_work_estimate += 2 * m;
 
   // Solve B'*y = cB
   ft.b_transpose_solve(c_basic, y);
@@ -2527,7 +2559,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       "|| y || %e || cB || %e\n", vector_norm_inf<i_t, f_t>(y), vector_norm_inf<i_t, f_t>(c_basic));
   }
 
-  phase2::compute_reduced_costs(objective.underlying(), lp.A, y, basic_list, nonbasic_list, z);
+  phase2::compute_reduced_costs(
+    objective, lp.A, y, basic_list, nonbasic_list, z, phase2_work_estimate);
   if constexpr (print_norms) { settings.log.printf("|| z || %e\n", vector_norm_inf<i_t, f_t>(z)); }
 
 #ifdef COMPUTE_DUAL_RESIDUAL
@@ -2541,6 +2574,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 #endif
 
   phase2::set_primal_variables_on_bounds(lp, settings, z, vstatus, x);
+  phase2_work_estimate += 5 * (n - m);
 
 #ifdef PRINT_VSTATUS_CHANGES
   i_t num_vstatus_changes;
@@ -2552,6 +2586,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
   const f_t init_dual_inf =
     phase2::dual_infeasibility(lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol);
+  phase2_work_estimate += 3 * n;
   if (init_dual_inf > settings.dual_tol) {
     settings.log.printf("Initial dual infeasibility %e\n", init_dual_inf);
   }
@@ -2561,9 +2596,17 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       settings.log.printf("Free variable %d vstatus %d\n", j, vstatus[j]);
     }
   }
+  phase2_work_estimate += 3 * n;
 
-  phase2::compute_primal_variables(
-    ft, lp.rhs, lp.A, basic_list, nonbasic_list, settings.tight_tol, x, xB_workspace);
+  phase2::compute_primal_variables(ft,
+                                   lp.rhs,
+                                   lp.A,
+                                   basic_list,
+                                   nonbasic_list,
+                                   settings.tight_tol,
+                                   x,
+                                   xB_workspace,
+                                   phase2_work_estimate);
 
   if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
   if (print_norms) { settings.log.printf("|| x || %e\n", vector_norm2<i_t, f_t>(x)); }
@@ -2580,14 +2623,17 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   if (delta_y_steepest_edge.size() == 0) {
     PHASE2_NVTX_RANGE("DualSimplex::initialize_steepest_edge_norms");
     delta_y_steepest_edge.resize(n);
+    phase2_work_estimate += n;
     if (slack_basis) {
       phase2::initialize_steepest_edge_norms_from_slack_basis(
         basic_list, nonbasic_list, delta_y_steepest_edge);
+      phase2_work_estimate += 2 * n;
     } else {
       std::fill(delta_y_steepest_edge.begin(), delta_y_steepest_edge.end(), -1);
+      phase2_work_estimate += n;
       f_t steepest_edge_start = tic();
       i_t status              = phase2::initialize_steepest_edge_norms(
-        lp, settings, start_time, basic_list, ft, delta_y_steepest_edge);
+        lp, settings, start_time, basic_list, ft, delta_y_steepest_edge, phase2_work_estimate);
       f_t steepest_edge_time = toc(steepest_edge_start);
       if (status == CONCURRENT_HALT_RETURN) { return dual::status_t::CONCURRENT_LIMIT; }
       if (status == -1) { return dual::status_t::TIME_LIMIT; }
@@ -2598,6 +2644,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       const i_t j = basic_list[k];
       if (delta_y_steepest_edge[j] <= 0.0) { delta_y_steepest_edge[j] = 1e-4; }
     }
+    phase2_work_estimate += 2 * m;
     settings.log.printf("using exisiting steepest edge %e\n",
                         vector_norm2<i_t, f_t>(delta_y_steepest_edge));
   }
@@ -2608,28 +2655,31 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
   const i_t iter_limit = settings.iteration_limit;
 
-  // Instrumented vectors for memory access tracking
-  ins_vector<f_t> delta_y(m, 0.0);
-  ins_vector<f_t> delta_z(n, 0.0);
-  ins_vector<f_t> delta_x(n, 0.0);
-  ins_vector<f_t> delta_x_flip(n, 0.0);
-  ins_vector<f_t> atilde(m, 0.0);
-  ins_vector<i_t> atilde_mark(m, 0);
-  ins_vector<i_t> atilde_index;
-  ins_vector<i_t> nonbasic_mark(n);
-  ins_vector<i_t> basic_mark(n);
-  ins_vector<i_t> delta_z_mark(n, 0);
-  ins_vector<i_t> delta_z_indices;
-  ins_vector<f_t> v(m, 0.0);
-  ins_vector<f_t> squared_infeasibilities;
-  ins_vector<i_t> infeasibility_indices;
+  std::vector<f_t> delta_y(m, 0.0);
+  std::vector<f_t> delta_z(n, 0.0);
+  std::vector<f_t> delta_x(n, 0.0);
+  std::vector<f_t> delta_x_flip(n, 0.0);
+  std::vector<f_t> atilde(m, 0.0);
+  std::vector<i_t> atilde_mark(m, 0);
+  std::vector<i_t> atilde_index;
+  std::vector<i_t> nonbasic_mark(n);
+  std::vector<i_t> basic_mark(n);
+  std::vector<i_t> delta_z_mark(n, 0);
+  std::vector<i_t> delta_z_indices;
+  std::vector<f_t> v(m, 0.0);
+  std::vector<f_t> squared_infeasibilities;
+  std::vector<i_t> infeasibility_indices;
+  phase2_work_estimate += 6 * n + 4 * m;
 
   delta_z_indices.reserve(n);
+  phase2_work_estimate += n;
 
-  phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark);
+  phase2::reset_basis_mark(
+    basic_list, nonbasic_list, basic_mark, nonbasic_mark, phase2_work_estimate);
 
-  ins_vector<uint8_t> bounded_variables(n, 0);
+  std::vector<uint8_t> bounded_variables(n, 0);
   phase2::compute_bounded_info(lp.lower, lp.upper, bounded_variables);
+  phase2_work_estimate += 4 * n;
 
   f_t primal_infeasibility;
   f_t primal_infeasibility_squared =
@@ -2640,21 +2690,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                    squared_infeasibilities,
                                                    infeasibility_indices,
                                                    primal_infeasibility);
+  phase2_work_estimate += 4 * m + 2 * n;
 
 #ifdef CHECK_BASIC_INFEASIBILITIES
   phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0);
 #endif
 
   csc_matrix_t<i_t, f_t> A_transpose(1, 1, 0);
-  aggregator.add("A_transpose.col_start", A_transpose.col_start);
-  aggregator.add("A_transpose.i", A_transpose.i);
-  aggregator.add("A_transpose.x", A_transpose.x);
-  {
-    PHASE2_NVTX_RANGE("DualSimplex::transpose_A");
-    lp.A.transpose(A_transpose);
-  }
+  lp.A.transpose(A_transpose);
+  phase2_work_estimate += 2 * lp.A.col_start[lp.A.n];
+
   f_t obj = compute_objective(lp, x);
-  init_scope.pop();  // End phase2_advanced_init range
+  phase2_work_estimate += 2 * n;
 
   const i_t start_iter = iter;
 
@@ -2665,11 +2712,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   f_t delta_y_nz_percentage = 0.0;
   phase2::phase2_timers_t<i_t, f_t> timers(false);
 
-  // // Feature collection for regression training
-  // dual_simplex_features_t<i_t, f_t> features;
-  // features.init_from_problem(lp, settings, phase, slack_basis != 0, initialize_basis);
-  // features.start_iteration = iter;
-
   // Sparse vectors for main loop (declared outside loop for instrumentation)
   sparse_vector_t<i_t, f_t> delta_y_sparse(m, 0);
   sparse_vector_t<i_t, f_t> UTsol_sparse(m, 0);
@@ -2680,82 +2722,20 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   sparse_vector_t<i_t, f_t> v_sparse(m, 0);       // For steepest edge norms
   sparse_vector_t<i_t, f_t> atilde_sparse(m, 0);  // For flip adjustments
 
-  // Add remaining instrumented vectors to aggregator (x, y, z, objective, c_basic, xB_workspace
-  // added earlier) Delta vectors
-  aggregator.add("delta_y", delta_y);
-  aggregator.add("delta_z", delta_z);
-  aggregator.add("delta_x", delta_x);
-  aggregator.add("delta_x_flip", delta_x_flip);
-  aggregator.add("atilde", atilde);
-  aggregator.add("atilde_mark", atilde_mark);
-  aggregator.add("atilde_index", atilde_index);
-  aggregator.add("nonbasic_mark", nonbasic_mark);
-  aggregator.add("basic_mark", basic_mark);
-  aggregator.add("delta_z_mark", delta_z_mark);
-  aggregator.add("delta_z_indices", delta_z_indices);
-  aggregator.add("v", v);
-  aggregator.add("squared_infeasibilities", squared_infeasibilities);
-  aggregator.add("infeasibility_indices", infeasibility_indices);
-  aggregator.add("bounded_variables", bounded_variables);
-
-  // Add sparse vector internal arrays to aggregator
-  aggregator.add("delta_y_sparse.i", delta_y_sparse.i);
-  aggregator.add("delta_y_sparse.x", delta_y_sparse.x);
-  aggregator.add("UTsol_sparse.i", UTsol_sparse.i);
-  aggregator.add("UTsol_sparse.x", UTsol_sparse.x);
-  aggregator.add("delta_xB_0_sparse.i", delta_xB_0_sparse.i);
-  aggregator.add("delta_xB_0_sparse.x", delta_xB_0_sparse.x);
-  aggregator.add("utilde_sparse.i", utilde_sparse.i);
-  aggregator.add("utilde_sparse.x", utilde_sparse.x);
-  aggregator.add("scaled_delta_xB_sparse.i", scaled_delta_xB_sparse.i);
-  aggregator.add("scaled_delta_xB_sparse.x", scaled_delta_xB_sparse.x);
-  aggregator.add("rhs_sparse.i", rhs_sparse.i);
-  aggregator.add("rhs_sparse.x", rhs_sparse.x);
-  aggregator.add("v_sparse.i", v_sparse.i);
-  aggregator.add("v_sparse.x", v_sparse.x);
-  aggregator.add("atilde_sparse.i", atilde_sparse.i);
-  aggregator.add("atilde_sparse.x", atilde_sparse.x);
-
-  // Add A matrix for entering column access during basis update
-  aggregator.add("A.col_start", lp.A.col_start);
-  aggregator.add("A.i", lp.A.i);
-  aggregator.add("A.x", lp.A.x);
-
   // Track iteration interval start time for runtime measurement
   [[maybe_unused]] f_t interval_start_time = toc(start_time);
   i_t last_feature_log_iter                = iter;
 
-  cuopt::scope_guard work_unit_guard([&]() {
-    i_t remaining_iters = iter - last_feature_log_iter;
-    if (remaining_iters <= 0) return;
-
-    auto [total_loads, total_stores] = aggregator.collect_and_flush();
-    // features.byte_loads              = total_loads;
-    // features.byte_stores             = total_stores;
-
-    // f_t now                   = toc(start_time);
-    // features.interval_runtime = now - interval_start_time;
-    // interval_start_time       = now;
-
-    // features.iteration             = iter;
-    // features.num_refactors         = num_refactors;
-    // features.num_basis_updates     = ft.num_updates();
-    // features.sparse_delta_z_count  = sparse_delta_z;
-    // features.dense_delta_z_count   = dense_delta_z;
-    // features.total_bound_flips     = total_bound_flips;
-    // features.num_infeasibilities   = infeasibility_indices.size();
-    // features.delta_y_nz_percentage = delta_y_nz_percentage;
-    // features.log_features(settings);
-
-    if (work_unit_context) {
-      // TEMP;
-      work_unit_context->record_work_sync_on_horizon((total_loads + total_stores) / 1e8);
-    }
-  });
+  phase2_work_estimate += ft.work_estimate();
+  ft.clear_work_estimate();
+  if (work_unit_context) {
+    work_unit_context->record_work_sync_on_horizon((phase2_work_estimate) / 1e8);
+  }
+  phase2_work_estimate = 0.0;
 
   if (phase == 2) {
     settings.log.printf("%5d %+.16e %7d %.8e %.2e %.2f\n",
-                        0,
+                        iter,
                         compute_user_objective(lp, obj),
                         infeasibility_indices.size(),
                         primal_infeasibility_squared,
@@ -2783,7 +2763,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                                            infeasibility_indices,
                                                                            direction,
                                                                            basic_leaving_index,
-                                                                           max_val);
+                                                                           max_val,
+                                                                           phase2_work_estimate);
       } else {
         // Max infeasibility pricing
         leaving_index = phase2::phase2_pricing(
@@ -2883,6 +2864,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     timers.btran_time += timers.stop_timer();
 
     const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared();
+    phase2_work_estimate += 2 * delta_y_sparse.i.size();
     if (delta_y_steepest_edge[leaving_index] <
         settings.steepest_edge_ratio * steepest_edge_norm_check) {
       constexpr bool verbose = false;
@@ -2904,6 +2886,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     for (i_t k = 0; k < nz_delta_y; k++) {
       if (std::abs(delta_y_sparse.x[k]) > 1e-12) { delta_y_nz0++; }
     }
+    phase2_work_estimate += nz_delta_y;
     delta_y_nz_percentage    = delta_y_nz0 / static_cast<f_t>(m) * 100.0;
     const bool use_transpose = delta_y_nz_percentage <= 30.0;
     {
@@ -2917,11 +2900,13 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                 nonbasic_mark,
                                 delta_z_mark,
                                 delta_z_indices,
-                                delta_z);
+                                delta_z,
+                                phase2_work_estimate);
       } else {
         dense_delta_z++;
         // delta_zB = sigma*ei
         delta_y_sparse.to_dense(delta_y);
+        phase2_work_estimate += delta_y.size();
         phase2::compute_reduced_cost_update(lp,
                                             basic_list,
                                             nonbasic_list,
@@ -2930,7 +2915,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                             direction,
                                             delta_z_mark,
                                             delta_z_indices,
-                                            delta_z);
+                                            delta_z,
+                                            phase2_work_estimate);
       }
     }
     timers.delta_z_time += timers.stop_timer();
@@ -2974,14 +2960,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                    slope,
                                                    lp.lower,
                                                    lp.upper,
-                                                   bounded_variables.underlying(),
+                                                   bounded_variables,
                                                    vstatus,
                                                    nonbasic_list,
                                                    z,
-                                                   delta_z.underlying(),
-                                                   delta_z_indices.underlying(),
+                                                   delta_z,
+                                                   delta_z_indices,
                                                    nonbasic_mark);
         entering_index = bfrt.compute_step_length(step_length, nonbasic_entering_index);
+        phase2_work_estimate += bfrt.work_estimate();
         if (entering_index == RATIO_TEST_NUMERICAL_ISSUES) {
           settings.log.printf("Numerical issues encountered in ratio test.\n");
           return dual::status_t::NUMERICAL;
@@ -2998,26 +2985,37 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       settings.log.printf("No entering variable found. Iter %d\n", iter);
       settings.log.printf("Scaled infeasibility %e\n", max_val);
       f_t perturbation = phase2::amount_of_perturbation(lp, objective);
+      phase2_work_estimate += 2 * n;
 
       if (perturbation > 0.0 && phase == 2) {
         // Try to remove perturbation
         std::vector<f_t> unperturbed_y(m);
         std::vector<f_t> unperturbed_z(n);
+        phase2_work_estimate += m + n;
         phase2::compute_dual_solution_from_basis(
-          lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z);
+          lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z, phase2_work_estimate);
         {
           const f_t dual_infeas = phase2::dual_infeasibility(
             lp, settings, vstatus, unperturbed_z, settings.tight_tol, settings.dual_tol);
+          phase2_work_estimate += 3 * n;
           settings.log.printf("Dual infeasibility after removing perturbation %e\n", dual_infeas);
           if (dual_infeas <= settings.dual_tol) {
             settings.log.printf("Removed perturbation of %.2e.\n", perturbation);
-            z            = unperturbed_z;
-            y            = unperturbed_y;
+            z = unperturbed_z;
+            y = unperturbed_y;
+            phase2_work_estimate += 2 * n + 2 * m;
             perturbation = 0.0;
 
             std::vector<f_t> unperturbed_x(n);
-            phase2::compute_primal_solution_from_basis(
-              lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace);
+            phase2_work_estimate += n;
+            phase2::compute_primal_solution_from_basis(lp,
+                                                       ft,
+                                                       basic_list,
+                                                       nonbasic_list,
+                                                       vstatus,
+                                                       unperturbed_x,
+                                                       xB_workspace,
+                                                       phase2_work_estimate);
             x = unperturbed_x;
             primal_infeasibility_squared =
               phase2::compute_initial_primal_infeasibilities(lp,
@@ -3027,11 +3025,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                              squared_infeasibilities,
                                                              infeasibility_indices,
                                                              primal_infeasibility);
+            phase2_work_estimate += 4 * m + 2 * n;
             settings.log.printf("Updated primal infeasibility: %e\n", primal_infeasibility);
 
             objective = lp.objective;
+            phase2_work_estimate += 2 * n;
             // Need to reset the objective value, since we have recomputed x
             obj = phase2::compute_perturbed_objective(objective, x);
+            phase2_work_estimate += 2 * n;
             if (dual_infeas <= settings.dual_tol && primal_infeasibility <= settings.primal_tol) {
               phase2::prepare_optimality(1,
                                          primal_infeasibility,
@@ -3056,14 +3057,23 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
             settings.log.printf(
               "Continuing with perturbation removed and steepest edge norms reset\n");
             // Clear delta_z before restarting the iteration
+            phase2_work_estimate += 3 * delta_z_indices.size();
             phase2::clear_delta_z(
               entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z);
             continue;
           } else {
             std::vector<f_t> unperturbed_x(n);
-            phase2::compute_primal_solution_from_basis(
-              lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace);
+            phase2_work_estimate += n;
+            phase2::compute_primal_solution_from_basis(lp,
+                                                       ft,
+                                                       basic_list,
+                                                       nonbasic_list,
+                                                       vstatus,
+                                                       unperturbed_x,
+                                                       xB_workspace,
+                                                       phase2_work_estimate);
             x = unperturbed_x;
+            phase2_work_estimate += 2 * n;
             primal_infeasibility_squared =
               phase2::compute_initial_primal_infeasibilities(lp,
                                                              settings,
@@ -3072,9 +3082,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                              squared_infeasibilities,
                                                              infeasibility_indices,
                                                              primal_infeasibility);
+            phase2_work_estimate += 4 * m + 2 * n;
 
             const f_t orig_dual_infeas = phase2::dual_infeasibility(
               lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol);
+            phase2_work_estimate += 3 * n;
 
             if (primal_infeasibility <= settings.primal_tol &&
                 orig_dual_infeas <= settings.dual_tol) {
@@ -3135,8 +3147,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
       const f_t dual_infeas =
         phase2::dual_infeasibility(lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol);
+      phase2_work_estimate += 3 * n;
       settings.log.printf("Dual infeasibility %e\n", dual_infeas);
       const f_t primal_inf = phase2::primal_infeasibility(lp, settings, vstatus, x);
+      phase2_work_estimate += 3 * n;
       settings.log.printf("Primal infeasibility %e\n", primal_inf);
       settings.log.printf("Updates %d\n", ft.num_updates());
       settings.log.printf("Steepest edge %e\n", max_val);
@@ -3152,8 +3166,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     // Update dual variables
     // y <- y + steplength * delta_y
     // z <- z + steplength * delta_z
-    i_t update_dual_variables_status = phase2::update_dual_variables(
-      delta_y_sparse, delta_z_indices, delta_z, step_length, leaving_index, y, z);
+    i_t update_dual_variables_status = phase2::update_dual_variables(delta_y_sparse,
+                                                                     delta_z_indices,
+                                                                     delta_z,
+                                                                     step_length,
+                                                                     leaving_index,
+                                                                     y,
+                                                                     z,
+                                                                     phase2_work_estimate);
     if (update_dual_variables_status == -1) {
       settings.log.printf("Numerical issues encountered in update_dual_variables.\n");
       return dual::status_t::NUMERICAL;
@@ -3182,7 +3202,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                 delta_x_flip,
                                                 atilde_mark,
                                                 atilde,
-                                                atilde_index);
+                                                atilde_index,
+                                                phase2_work_estimate);
 
     timers.flip_time += timers.stop_timer();
     total_bound_flips += num_flipped;
@@ -3199,7 +3220,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                atilde_sparse,
                                delta_xB_0_sparse,
                                delta_x_flip,
-                               x);
+                               x,
+                               phase2_work_estimate);
       timers.ftran_time += timers.stop_timer();
     }
 
@@ -3221,7 +3243,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                   x,
                                   utilde_sparse,
                                   scaled_delta_xB_sparse,
-                                  delta_x) == -1) {
+                                  delta_x,
+                                  phase2_work_estimate) == -1) {
         settings.log.printf("Failed to compute delta_x. Iter %d\n", iter);
         return dual::status_t::NUMERICAL;
       }
@@ -3248,7 +3271,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                                         entering_index,
                                                                         v,
                                                                         v_sparse,
-                                                                        delta_y_steepest_edge);
+                                                                        delta_y_steepest_edge,
+                                                                        phase2_work_estimate);
 #ifdef STEEPEST_EDGE_DEBUG
     if (steepest_edge_status == -1) {
       settings.log.printf("Num updates %d\n", ft.num_updates());
@@ -3260,7 +3284,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
     timers.start_timer();
     // x <- x + delta_x
-    phase2::update_primal_variables(scaled_delta_xB_sparse, basic_list, delta_x, entering_index, x);
+    phase2::update_primal_variables(
+      scaled_delta_xB_sparse, basic_list, delta_x, entering_index, x, phase2_work_estimate);
     timers.vector_time += timers.stop_timer();
 
 #ifdef COMPUTE_PRIMAL_RESIDUAL
@@ -3275,8 +3300,13 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     timers.start_timer();
     // TODO(CMM): Do I also need to update the objective due to the bound flips?
     // TODO(CMM): I'm using the unperturbed objective here, should this be the perturbed objective?
-    phase2::update_objective(
-      basic_list, scaled_delta_xB_sparse.i, lp.objective, delta_x, entering_index, obj);
+    phase2::update_objective(basic_list,
+                             scaled_delta_xB_sparse.i,
+                             lp.objective,
+                             delta_x,
+                             entering_index,
+                             obj,
+                             phase2_work_estimate);
     timers.objective_time += timers.stop_timer();
 
     timers.start_timer();
@@ -3291,10 +3321,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                           x,
                                           entering_index,
                                           leaving_index,
-                                          delta_xB_0_sparse.i.underlying(),
-                                          squared_infeasibilities.underlying(),
-                                          infeasibility_indices.underlying(),
-                                          primal_infeasibility_squared);
+                                          delta_xB_0_sparse.i,
+                                          squared_infeasibilities,
+                                          infeasibility_indices,
+                                          primal_infeasibility_squared,
+                                          phase2_work_estimate);
     // Update primal infeasibilities due to changes in basic variables
     // from the leaving and entering variables
     phase2::update_primal_infeasibilities(lp,
@@ -3303,21 +3334,23 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                           x,
                                           entering_index,
                                           leaving_index,
-                                          scaled_delta_xB_sparse.i.underlying(),
-                                          squared_infeasibilities.underlying(),
-                                          infeasibility_indices.underlying(),
-                                          primal_infeasibility_squared);
+                                          scaled_delta_xB_sparse.i,
+                                          squared_infeasibilities,
+                                          infeasibility_indices,
+                                          primal_infeasibility_squared,
+                                          phase2_work_estimate);
     // Update the entering variable
     phase2::update_single_primal_infeasibility(lp.lower,
                                                lp.upper,
                                                x,
                                                settings.primal_tol,
-                                               squared_infeasibilities.underlying(),
-                                               infeasibility_indices.underlying(),
+                                               squared_infeasibilities,
+                                               infeasibility_indices,
                                                entering_index,
                                                primal_infeasibility_squared);
 
-    phase2::clean_up_infeasibilities(squared_infeasibilities, infeasibility_indices);
+    phase2::clean_up_infeasibilities(
+      squared_infeasibilities, infeasibility_indices, phase2_work_estimate);
 
 #if CHECK_PRIMAL_INFEASIBILITIES
     phase2::check_primal_infeasibilities(
@@ -3326,11 +3359,13 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     timers.update_infeasibility_time += timers.stop_timer();
 
     // Clear delta_x
-    phase2::clear_delta_x(basic_list, entering_index, scaled_delta_xB_sparse, delta_x);
+    phase2::clear_delta_x(
+      basic_list, entering_index, scaled_delta_xB_sparse, delta_x, phase2_work_estimate);
 
     timers.start_timer();
     f_t sum_perturb = 0.0;
-    phase2::compute_perturbation(lp, settings, delta_z_indices, z, objective, sum_perturb);
+    phase2::compute_perturbation(
+      lp, settings, delta_z_indices, z, objective, sum_perturb, phase2_work_estimate);
     timers.perturb_time += timers.stop_timer();
 
     // Update basis information
@@ -3394,12 +3429,21 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
           settings.log.printf("Successfully repaired basis. Iteration %d\n", iter);
         }
 
-        phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark);
+        phase2::reset_basis_mark(
+          basic_list, nonbasic_list, basic_mark, nonbasic_mark, phase2_work_estimate);
         if (should_recompute_x) {
           std::vector<f_t> unperturbed_x(n);
-          phase2::compute_primal_solution_from_basis(
-            lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace);
+          phase2_work_estimate += n;
+          phase2::compute_primal_solution_from_basis(lp,
+                                                     ft,
+                                                     basic_list,
+                                                     nonbasic_list,
+                                                     vstatus,
+                                                     unperturbed_x,
+                                                     xB_workspace,
+                                                     phase2_work_estimate);
           x = unperturbed_x;
+          phase2_work_estimate += 2 * n;
         }
         primal_infeasibility_squared =
           phase2::compute_initial_primal_infeasibilities(lp,
@@ -3409,6 +3453,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                          squared_infeasibilities,
                                                          infeasibility_indices,
                                                          primal_infeasibility);
+        phase2_work_estimate += 4 * m + 2 * n;
       }
 #ifdef CHECK_BASIC_INFEASIBILITIES
       phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7);
@@ -3435,6 +3480,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     iter++;
 
     // Clear delta_z
+    phase2_work_estimate += 3 * delta_z_indices.size();
     phase2::clear_delta_z(entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z);
 
     f_t now = toc(start_time);
@@ -3443,27 +3489,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     if ((iter % FEATURE_LOG_INTERVAL) == 0 && work_unit_context) {
       [[maybe_unused]] i_t iters_elapsed = iter - last_feature_log_iter;
 
-      auto [total_loads, total_stores] = aggregator.collect_and_flush();
-      // features.byte_loads              = total_loads;
-      // features.byte_stores             = total_stores;
-
-      // features.interval_runtime = now - interval_start_time;
-      // interval_start_time       = now;
-
-      // features.iteration             = iter;
-      // features.num_refactors         = num_refactors;
-      // features.num_basis_updates     = ft.num_updates();
-      // features.sparse_delta_z_count  = sparse_delta_z;
-      // features.dense_delta_z_count   = dense_delta_z;
-      // features.total_bound_flips     = total_bound_flips;
-      // features.num_infeasibilities   = infeasibility_indices.size();
-      // features.delta_y_nz_percentage = delta_y_nz_percentage;
-      // features.log_features(settings);
-
-      if (work_unit_context) {
-        // TEMP;
-        work_unit_context->record_work_sync_on_horizon((total_loads + total_stores) / 1e8);
-      }
+      phase2_work_estimate += ft.work_estimate();
+      ft.clear_work_estimate();
+      work_unit_context->record_work_sync_on_horizon(phase2_work_estimate / 1e8);
+      phase2_work_estimate = 0.0;
 
       last_feature_log_iter = iter;
     }
diff --git a/cpp/src/dual_simplex/primal.cpp b/cpp/src/dual_simplex/primal.cpp
index 98f5f4193b..b1b6be0b75 100644
--- a/cpp/src/dual_simplex/primal.cpp
+++ b/cpp/src/dual_simplex/primal.cpp
@@ -264,6 +264,7 @@ primal::status_t primal_phase2(i_t phase,
   assert(lp.lower.size() == n);
   assert(lp.upper.size() == n);
   assert(lp.rhs.size() == m);
+  f_t work_estimate = 0;
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
   std::vector<i_t> superbasic_list;
@@ -294,8 +295,8 @@ primal::status_t primal_phase2(i_t phase,
   std::vector<i_t> q(m);
   std::vector<i_t> deficient;
   std::vector<i_t> slacks_needed;
-  i_t rank =
-    factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+  i_t rank = factorize_basis(
+    lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
   if (rank == CONCURRENT_HALT_RETURN) {
     return primal::status_t::CONCURRENT_LIMIT;
   } else if (rank != m) {
@@ -309,8 +310,10 @@ primal::status_t primal_phase2(i_t phase,
                  basic_list,
                  nonbasic_list,
                  superbasic_list,
-                 vstatus);
-    rank = factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
+                 vstatus,
+                 work_estimate);
+    rank = factorize_basis(
+      lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate);
     if (rank == CONCURRENT_HALT_RETURN) {
       return primal::status_t::CONCURRENT_LIMIT;
     } else if (rank == -1) {
diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp
index cb9834705c..93a726d065 100644
--- a/cpp/src/dual_simplex/right_looking_lu.cpp
+++ b/cpp/src/dual_simplex/right_looking_lu.cpp
@@ -33,17 +33,20 @@ struct element_t {
 };
 constexpr int kNone = -1;
 
-template <typename i_t, typename f_t, typename VectorI>
+template <typename i_t, typename f_t>
 i_t initialize_degree_data(const csc_matrix_t<i_t, f_t>& A,
-                           const VectorI& column_list,
+                           const std::vector<i_t>& column_list,
                            std::vector<i_t>& Cdegree,
                            std::vector<i_t>& Rdegree,
                            std::vector<std::vector<i_t>>& col_count,
-                           std::vector<std::vector<i_t>>& row_count)
+                           std::vector<std::vector<i_t>>& row_count,
+                           f_t& work_estimate)
 {
   const i_t n = column_list.size();
   const i_t m = A.m;
   std::fill(Rdegree.begin(), Rdegree.end(), 0);
+  work_estimate += Rdegree.size();
+
   i_t Bnz = 0;
   for (i_t k = 0; k < n; ++k) {
     const i_t j         = column_list[k];
@@ -55,11 +58,13 @@ i_t initialize_degree_data(const csc_matrix_t<i_t, f_t>& A,
       Bnz++;
     }
   }
+  work_estimate += 3 * n + 2 * Bnz;
 
   for (i_t k = 0; k < n; ++k) {
     assert(Cdegree[k] <= m && Cdegree[k] >= 0);
     col_count[Cdegree[k]].push_back(k);
   }
+  work_estimate += 3 * n;
 
   for (i_t k = 0; k < m; ++k) {
     assert(Rdegree[k] <= n && Rdegree[k] >= 0);
@@ -69,20 +74,24 @@ i_t initialize_degree_data(const csc_matrix_t<i_t, f_t>& A,
       if (verbose) { printf("Zero degree row %d\n", k); }
     }
   }
+  work_estimate += 4 * m;
+
   return Bnz;
 }
 
-template <typename i_t, typename f_t, typename VectorI>
+template <typename i_t, typename f_t>
 i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
-                  const VectorI& column_list,
+                  const std::vector<i_t>& column_list,
                   i_t Bnz,
                   std::vector<element_t<i_t, f_t>>& elements,
                   std::vector<i_t>& first_in_row,
-                  std::vector<i_t>& first_in_col)
+                  std::vector<i_t>& first_in_col,
+                  f_t& work_estimate)
 {
   const i_t m = A.m;
   const i_t n = column_list.size();
   std::vector<i_t> last_element_in_row(m, kNone);
+  work_estimate += m;
 
   i_t nz = 0;
   for (i_t k = 0; k < n; ++k) {
@@ -110,6 +119,7 @@ i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
       nz++;
     }
   }
+  work_estimate += 3 * n + 15 * nz;
   assert(nz == Bnz);
 
   for (i_t j = 0; j < n; j++) {
@@ -120,6 +130,7 @@ i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
       assert(entry->i < m);
     }
   }
+  work_estimate += 2 * n + nz;
 
   for (i_t i = 0; i < m; i++) {
     for (i_t p = first_in_row[i]; p != kNone; p = elements[p].next_in_row) {
@@ -129,6 +140,7 @@ i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
       assert(entry->j >= 0);
     }
   }
+  work_estimate += 2 * m + nz;
 
   return 0;
 }
@@ -200,7 +212,8 @@ i_t markowitz_search(const std::vector<i_t>& Cdegree,
                      f_t threshold_tol,
                      i_t& pivot_i,
                      i_t& pivot_j,
-                     i_t& pivot_p)
+                     i_t& pivot_p,
+                     f_t& work_estimate)
 {
   i_t nz      = 1;
   const i_t m = Rdegree.size();
@@ -217,6 +230,7 @@ i_t markowitz_search(const std::vector<i_t>& Cdegree,
       assert(Cdegree[j] == nz);
       const f_t max_in_col = max_in_column[j];
 
+      work_estimate += 3 * nz;
       for (i_t p = first_in_col[j]; p != kNone; p = elements[p].next_in_column) {
         element_t<i_t, f_t>* entry = &elements[p];
         const i_t i                = entry->i;
@@ -260,6 +274,7 @@ i_t markowitz_search(const std::vector<i_t>& Cdegree,
     assert(row_count[nz].size() >= 0);
     for (const i_t i : row_count[nz]) {
       assert(Rdegree[i] == nz);
+      work_estimate += 5 * nz;
 #ifdef THRESHOLD_ROOK_PIVOTING
       const f_t max_in_row_i = max_in_row[i];
 #endif
@@ -301,9 +316,11 @@ void update_Cdegree_and_col_count(i_t pivot_i,
                                   const std::vector<i_t>& first_in_row,
                                   std::vector<i_t>& Cdegree,
                                   std::vector<std::vector<i_t>>& col_count,
-                                  std::vector<element_t<i_t, f_t>>& elements)
+                                  std::vector<element_t<i_t, f_t>>& elements,
+                                  f_t& work_estimate)
 {
   // Update Cdegree and col_count
+  i_t loop_count = 0;
   for (i_t p = first_in_row[pivot_i]; p != kNone; p = elements[p].next_in_row) {
     element_t<i_t, f_t>* entry = &elements[p];
     const i_t j                = entry->j;
@@ -317,13 +334,16 @@ void update_Cdegree_and_col_count(i_t pivot_i,
         // Remove col j from col_count[cdeg]
         std::swap(*it, col_count[cdeg].back());
         col_count[cdeg].pop_back();
+        work_estimate += (it - col_count[cdeg].begin());
         break;
       }
     }
     cdeg = --Cdegree[j];
     assert(cdeg >= 0);
     if (j != pivot_j && cdeg >= 0) { col_count[cdeg].push_back(j); }
+    loop_count++;
   }
+  work_estimate += 7 * loop_count;
   Cdegree[pivot_j] = -1;
 }
 
@@ -333,9 +353,11 @@ void update_Rdegree_and_row_count(i_t pivot_i,
                                   const std::vector<i_t>& first_in_col,
                                   std::vector<i_t>& Rdegree,
                                   std::vector<std::vector<i_t>>& row_count,
-                                  std::vector<element_t<i_t, f_t>>& elements)
+                                  std::vector<element_t<i_t, f_t>>& elements,
+                                  f_t& work_estimate)
 {
   // Update Rdegree and row_count
+  i_t loop_count = 0;
   for (i_t p = first_in_col[pivot_j]; p != kNone; p = elements[p].next_in_column) {
     element_t<i_t, f_t>* entry = &elements[p];
     const i_t i                = entry->i;
@@ -348,13 +370,16 @@ void update_Rdegree_and_row_count(i_t pivot_i,
         // Remove row i from row_count[rdeg]
         std::swap(*it, row_count[rdeg].back());
         row_count[rdeg].pop_back();
+        work_estimate += (it - row_count[rdeg].begin());
         break;
       }
     }
     rdeg = --Rdegree[i];
     assert(rdeg >= 0);
     if (i != pivot_i && rdeg >= 0) { row_count[rdeg].push_back(i); }
+    loop_count++;
   }
+  work_estimate += 7 * loop_count;
   Rdegree[pivot_i] = -1;
 }
 
@@ -375,7 +400,8 @@ void schur_complement(i_t pivot_i,
                       std::vector<i_t>& Cdegree,
                       std::vector<std::vector<i_t>>& row_count,
                       std::vector<std::vector<i_t>>& col_count,
-                      std::vector<element_t<i_t, f_t>>& elements)
+                      std::vector<element_t<i_t, f_t>>& elements,
+                      f_t& work_estimate)
 {
   for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) {
     element_t<i_t, f_t>* e = &elements[p1];
@@ -384,8 +410,10 @@ void schur_complement(i_t pivot_i,
     for (i_t p3 = first_in_row[i]; p3 != kNone; p3 = elements[p3].next_in_row) {
       row_last = p3;
     }
+    work_estimate += 2 * Rdegree[i];
     row_last_workspace[i] = row_last;
   }
+  work_estimate += 4 * Cdegree[pivot_j];
 
   for (i_t p0 = first_in_row[pivot_i]; p0 != kNone; p0 = elements[p0].next_in_row) {
     element_t<i_t, f_t>* entry = &elements[p0];
@@ -402,6 +430,7 @@ void schur_complement(i_t pivot_i,
       column_j_workspace[i] = p1;
       col_last              = p1;
     }
+    work_estimate += 5 * Cdegree[j];
 
     for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) {
       element_t<i_t, f_t>* e = &elements[p1];
@@ -457,6 +486,7 @@ void schur_complement(i_t pivot_i,
             // Remove row i from row_count[rdeg]
             std::swap(*it, row_count[rdeg].back());
             row_count[rdeg].pop_back();
+            work_estimate += 2 * (it - row_count[rdeg].begin());
             break;
           }
         }
@@ -471,6 +501,7 @@ void schur_complement(i_t pivot_i,
             // Remove col j from col_count[cdeg]
             std::swap(*it, col_count[cdeg].back());
             col_count[cdeg].pop_back();
+            work_estimate += 2 * (it - col_count[cdeg].begin());
             break;
           }
         }
@@ -478,6 +509,7 @@ void schur_complement(i_t pivot_i,
         col_count[cdeg].push_back(j);  // Add column j to col_count[cdeg]
       }
     }
+    work_estimate += 10 * Cdegree[pivot_j];
 
     for (i_t p1 = first_in_col[j]; p1 != kNone; p1 = elements[p1].next_in_column) {
       element_t<i_t, f_t>* e = &elements[p1];
@@ -485,7 +517,9 @@ void schur_complement(i_t pivot_i,
       assert(e->j == j);
       column_j_workspace[i] = kNone;
     }
+    work_estimate += 5 * Cdegree[j];
   }
+  work_estimate += 10 * Rdegree[pivot_i];
 }
 
 template <typename i_t, typename f_t>
@@ -494,16 +528,19 @@ void remove_pivot_row(i_t pivot_i,
                       std::vector<i_t>& first_in_col,
                       std::vector<i_t>& first_in_row,
                       std::vector<f_t>& max_in_column,
-                      std::vector<element_t<i_t, f_t>>& elements)
+                      std::vector<element_t<i_t, f_t>>& elements,
+                      f_t& work_estimate)
 {
   // Remove the pivot row
 
+  i_t row_loop_count = 0;
   for (i_t p0 = first_in_row[pivot_i]; p0 != kNone; p0 = elements[p0].next_in_row) {
     element_t<i_t, f_t>* e = &elements[p0];
     const i_t j            = e->j;
     if (j == pivot_j) { continue; }
-    i_t last         = kNone;
-    f_t max_in_col_j = 0;
+    i_t last           = kNone;
+    f_t max_in_col_j   = 0;
+    i_t col_loop_count = 0;
     for (i_t p = first_in_col[j]; p != kNone; p = elements[p].next_in_column) {
       element_t<i_t, f_t>* entry = &elements[p];
       if (entry->i == pivot_i) {
@@ -520,9 +557,13 @@ void remove_pivot_row(i_t pivot_i,
         if (abs_entryx > max_in_col_j) { max_in_col_j = abs_entryx; }
       }
       last = p;
+      col_loop_count++;
     }
+    work_estimate += 3 * col_loop_count;
     max_in_column[j] = max_in_col_j;
+    row_loop_count++;
   }
+  work_estimate += 5 * row_loop_count;
 
   first_in_row[pivot_i] = kNone;
 }
@@ -533,9 +574,11 @@ void remove_pivot_col(i_t pivot_i,
                       std::vector<i_t>& first_in_col,
                       std::vector<i_t>& first_in_row,
                       std::vector<f_t>& max_in_row,
-                      std::vector<element_t<i_t, f_t>>& elements)
+                      std::vector<element_t<i_t, f_t>>& elements,
+                      f_t& work_estimate)
 {
   // Remove the pivot col
+  i_t col_loop_count = 0;
   for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) {
     element_t<i_t, f_t>* e = &elements[p1];
     const i_t i            = e->i;
@@ -543,6 +586,7 @@ void remove_pivot_col(i_t pivot_i,
 #ifdef THRESHOLD_ROOK_PIVOTING
     f_t max_in_row_i = 0.0;
 #endif
+    i_t row_loop_count = 0;
     for (i_t p = first_in_row[i]; p != kNone; p = elements[p].next_in_row) {
       element_t<i_t, f_t>* entry = &elements[p];
       if (entry->j == pivot_j) {
@@ -562,25 +606,30 @@ void remove_pivot_col(i_t pivot_i,
       }
 #endif
       last = p;
+      row_loop_count++;
     }
+    work_estimate += 3 * row_loop_count;
 #ifdef THRESHOLD_ROOK_PIVOTING
     max_in_row[i] = max_in_row_i;
 #endif
+    col_loop_count++;
   }
   first_in_col[pivot_j] = kNone;
+  work_estimate += 3 * col_loop_count;
 }
 
 }  // namespace
 
-template <typename i_t, typename f_t, typename VectorI>
+template <typename i_t, typename f_t>
 i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
                      const simplex_solver_settings_t<i_t, f_t>& settings,
                      f_t tol,
-                     const VectorI& column_list,
-                     VectorI& q,
+                     const std::vector<i_t>& column_list,
+                     std::vector<i_t>& q,
                      csc_matrix_t<i_t, f_t>& L,
                      csc_matrix_t<i_t, f_t>& U,
-                     VectorI& pinv)
+                     std::vector<i_t>& pinv,
+                     f_t& work_estimate)
 {
   raft::common::nvtx::range scope("LU::right_looking_lu");
   const i_t n = column_list.size();
@@ -596,23 +645,31 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
 
   std::vector<i_t> Rdegree(n);  // Rdegree[i] is the degree of row i
   std::vector<i_t> Cdegree(n);  // Cdegree[j] is the degree of column j
+  work_estimate += 2 * n;
 
   std::vector<std::vector<i_t>> col_count(
     n + 1);  // col_count[nz] is a list of columns with nz nonzeros in the active submatrix
   std::vector<std::vector<i_t>> row_count(
     n + 1);  // row_count[nz] is a list of rows with nz nonzeros in the active submatrix
+  work_estimate += 2 * n;
 
-  const i_t Bnz = initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count);
+  const i_t Bnz =
+    initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
   std::vector<element_t<i_t, f_t>> elements(Bnz);
   std::vector<i_t> first_in_row(n, kNone);
   std::vector<i_t> first_in_col(n, kNone);
-  load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col);
+  work_estimate += 2 * n + Bnz;
+  load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col, work_estimate);
 
   std::vector<i_t> column_j_workspace(n, kNone);
   std::vector<i_t> row_last_workspace(n);
   std::vector<f_t> max_in_column(n);
   std::vector<f_t> max_in_row(m);
+  work_estimate += 3 * n + m;
+
   initialize_max_in_column(first_in_col, elements, max_in_column);
+  work_estimate += Bnz;
+
 #ifdef THRESHOLD_ROOK_PIVOTING
   initialize_max_in_row(first_in_row, elements, max_in_row);
 #endif
@@ -622,6 +679,7 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
   Urow.n = Urow.m = n;
   Urow.row_start.resize(n + 1, -1);
   i_t Unz = 0;
+  work_estimate += 2 * n;
 
   i_t Lnz = 0;
   L.x.clear();
@@ -631,6 +689,7 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
   std::fill(pinv.begin(), pinv.end(), -1);
   std::vector<i_t> qinv(n);
   std::fill(qinv.begin(), qinv.end(), -1);
+  work_estimate += 4 * n;
 
   i_t pivots = 0;
   for (i_t k = 0; k < n; ++k) {
@@ -657,7 +716,8 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
                      threshold_tol,
                      pivot_i,
                      pivot_j,
-                     pivot_p);
+                     pivot_p,
+                     work_estimate);
     if (pivot_i == -1 || pivot_j == -1) { break; }
     element_t<i_t, f_t>* pivot_entry = &elements[pivot_p];
     assert(pivot_i != -1 && pivot_j != -1);
@@ -688,6 +748,7 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
         Unz++;
       }
     }
+    work_estimate += 4 * (Unz - Urow.row_start[k]);
 
     // L <- [L l]
     L.col_start[k] = Lnz;
@@ -708,10 +769,13 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
         Lnz++;
       }
     }
+    work_estimate += 4 * (Lnz - L.col_start[k]);
 
     // Update Cdegree and col_count
-    update_Cdegree_and_col_count(pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements);
-    update_Rdegree_and_row_count(pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements);
+    update_Cdegree_and_col_count(
+      pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements, work_estimate);
+    update_Rdegree_and_row_count(
+      pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements, work_estimate);
 
     // A22 <- A22 - l u^T
     schur_complement(pivot_i,
@@ -730,11 +794,14 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
                      Cdegree,
                      row_count,
                      col_count,
-                     elements);
+                     elements,
+                     work_estimate);
 
     // Remove the pivot row
-    remove_pivot_row(pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements);
-    remove_pivot_col(pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements);
+    remove_pivot_row(
+      pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements, work_estimate);
+    remove_pivot_col(
+      pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements, work_estimate);
 
     // Set pivot entry to sentinel value
     pivot_entry->i = -1;
@@ -855,6 +922,7 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
     for (i_t i = 0; i < m; ++i) {
       if (pinv[i] == -1) { pinv[i] = start++; }
     }
+    work_estimate += m;
 
     // Finalize the permutation q. Do this by first completing the inverse permutation qinv.
     // Then invert qinv to get the final permutation q.
@@ -862,7 +930,9 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
     for (i_t j = 0; j < n; ++j) {
       if (qinv[j] == -1) { qinv[j] = start++; }
     }
+    work_estimate += n;
     inverse_permutation(qinv, q);
+    work_estimate += 2 * n;
 
     return pivots;
   }
@@ -875,6 +945,7 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
   for (i_t p = 0; p < Lnz; ++p) {
     L.i[p] = pinv[L.i[p]];
   }
+  work_estimate += 3 * Lnz;
 
 #ifdef CHECK_LOWER_TRIANGULAR
   for (i_t j = 0; j < n; ++j) {
@@ -889,17 +960,23 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
 #endif
 
   csc_matrix_t<i_t, f_t> U_unpermuted(n, n, 1);
+  work_estimate += n;
   Urow.to_compressed_col(
     U_unpermuted);  // Convert Urow to U stored in compressed sparse column format
+  work_estimate += n + Unz;
   std::vector<i_t> row_perm(n);
+  work_estimate += n;
   inverse_permutation(pinv, row_perm);
+  work_estimate += 2 * n;
 
   std::vector<i_t> identity(n);
   for (i_t k = 0; k < n; k++) {
     identity[k] = k;
   }
+  work_estimate += 2 * n;
 
   U_unpermuted.permute_rows_and_cols(identity, q, U);
+  work_estimate += 3 * U.n + 5 * Unz;
 
 #ifdef CHECK_UPPER_TRIANGULAR
   for (i_t k = 0; k < n; ++k) {
@@ -929,6 +1006,7 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
   // We return the inverser row permutation vector pinv and the column permutation vector q
 
   f_t factorization_start_time = tic();
+  f_t work_estimate            = 0;
   const i_t n                  = A.n;
   const i_t m                  = A.m;
   assert(pinv.size() == m);
@@ -946,11 +1024,12 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
     column_list[k] = k;
   }
 
-  const i_t Bnz = initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count);
+  const i_t Bnz =
+    initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
   std::vector<element_t<i_t, f_t>> elements(Bnz);
   std::vector<i_t> first_in_row(m, kNone);
   std::vector<i_t> first_in_col(n, kNone);
-  load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col);
+  load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col, work_estimate);
 
   std::vector<i_t> column_j_workspace(m, kNone);
   std::vector<i_t> row_last_workspace(m);
@@ -997,7 +1076,8 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
                      threshold_tol,
                      pivot_i,
                      pivot_j,
-                     pivot_p);
+                     pivot_p,
+                     work_estimate);
     if (pivot_i == -1 || pivot_j == -1) {
       settings.log.debug("Breaking can't find a pivot %d\n", k);
       break;
@@ -1016,9 +1096,9 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
 
     // Update Cdegree and col_count
     update_Cdegree_and_col_count<i_t, f_t>(
-      pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements);
+      pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements, work_estimate);
     update_Rdegree_and_row_count<i_t, f_t>(
-      pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements);
+      pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements, work_estimate);
 
     // A22 <- A22 - l u^T
     schur_complement<i_t, f_t>(pivot_i,
@@ -1037,12 +1117,14 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
                                Cdegree,
                                row_count,
                                col_count,
-                               elements);
+                               elements,
+                               work_estimate);
 
     // Remove the pivot row
     remove_pivot_row<i_t, f_t>(
-      pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements);
-    remove_pivot_col<i_t, f_t>(pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements);
+      pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements, work_estimate);
+    remove_pivot_col<i_t, f_t>(
+      pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements, work_estimate);
 
     // Set pivot entry to sentinel value
     pivot_entry->i = -1;
@@ -1149,25 +1231,15 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
 
 #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
 
-template int right_looking_lu<int, double, std::vector<int>>(
-  const csc_matrix_t<int, double>& A,
-  const simplex_solver_settings_t<int, double>& settings,
-  double tol,
-  const std::vector<int>& column_list,
-  std::vector<int>& q,
-  csc_matrix_t<int, double>& L,
-  csc_matrix_t<int, double>& U,
-  std::vector<int>& pinv);
-
-template int right_looking_lu<int, double, ins_vector<int>>(
-  const csc_matrix_t<int, double>& A,
-  const simplex_solver_settings_t<int, double>& settings,
-  double tol,
-  const ins_vector<int>& column_list,
-  ins_vector<int>& q,
-  csc_matrix_t<int, double>& L,
-  csc_matrix_t<int, double>& U,
-  ins_vector<int>& pinv);
+template int right_looking_lu<int, double>(const csc_matrix_t<int, double>& A,
+                                           const simplex_solver_settings_t<int, double>& settings,
+                                           double tol,
+                                           const std::vector<int>& column_list,
+                                           std::vector<int>& q,
+                                           csc_matrix_t<int, double>& L,
+                                           csc_matrix_t<int, double>& U,
+                                           std::vector<int>& pinv,
+                                           double& work_estimate);
 
 template int right_looking_lu_row_permutation_only<int, double>(
   const csc_matrix_t<int, double>& A,
diff --git a/cpp/src/dual_simplex/right_looking_lu.hpp b/cpp/src/dual_simplex/right_looking_lu.hpp
index 179fff01b2..39182eb3f4 100644
--- a/cpp/src/dual_simplex/right_looking_lu.hpp
+++ b/cpp/src/dual_simplex/right_looking_lu.hpp
@@ -14,15 +14,16 @@
 
 namespace cuopt::linear_programming::dual_simplex {
 
-template <typename i_t, typename f_t, typename VectorI>
+template <typename i_t, typename f_t>
 i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
                      const simplex_solver_settings_t<i_t, f_t>& settings,
                      f_t tol,
-                     const VectorI& column_list,
-                     VectorI& q,
+                     const std::vector<i_t>& column_list,
+                     std::vector<i_t>& q,
                      csc_matrix_t<i_t, f_t>& L,
                      csc_matrix_t<i_t, f_t>& U,
-                     VectorI& pinv);
+                     std::vector<i_t>& pinv,
+                     f_t& work_estimate);
 
 template <typename i_t, typename f_t>
 i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
diff --git a/cpp/src/dual_simplex/singletons.cpp b/cpp/src/dual_simplex/singletons.cpp
index 347604dcea..dbba46f6a5 100644
--- a/cpp/src/dual_simplex/singletons.cpp
+++ b/cpp/src/dual_simplex/singletons.cpp
@@ -7,12 +7,10 @@
 
 #include <dual_simplex/singletons.hpp>
 #include <dual_simplex/triangle_solve.hpp>
-#include <utilities/memory_instrumentation.hpp>
+#include <dual_simplex/types.hpp>
 
 #include <cstdio>
 
-using cuopt::ins_vector;
-
 namespace cuopt::linear_programming::dual_simplex {
 
 // Destroys the queue but prints it
@@ -27,10 +25,11 @@ void print_queue(std::queue<i_t>& q)
   printf("\n");
 }
 
-template <typename i_t>
+template <typename i_t, typename f_t>
 i_t order_singletons(std::queue<i_t>& singleton_queue,
                      i_t& singletons_found,
-                     row_col_graph_t<i_t>& G)
+                     row_col_graph_t<i_t>& G,
+                     f_t& work_estimate)
 {
   constexpr i_t kEmpty = -1;
   while (!singleton_queue.empty()) {
@@ -68,6 +67,7 @@ i_t order_singletons(std::queue<i_t>& singleton_queue,
         break;
       }
     }
+    work_estimate += 2 * (xend - G.Xp[xpivot]);
     assert(ypivot != kEmpty);
     assert(G.Ydeg[ypivot] >= 0);
 
@@ -85,6 +85,7 @@ i_t order_singletons(std::queue<i_t>& singleton_queue,
         singleton_queue.push(x);
       }
     }
+    work_estimate += 2 * (yend - G.Yp[ypivot]);
     // Mark the pivot by flipping the degrees
     G.Xdeg[xpivot] = FLIP(1);
     G.Ydeg[ypivot] = FLIP(G.Ydeg[ypivot]);
@@ -102,7 +103,8 @@ template <typename i_t, typename f_t>
 void create_row_representation(const csc_matrix_t<i_t, f_t>& A,
                                std::vector<i_t>& row_start,
                                std::vector<i_t>& col_index,
-                               std::vector<i_t>& workspace)
+                               std::vector<i_t>& workspace,
+                               f_t& work_estimate)
 {
   // row counts
   i_t n  = A.n;
@@ -114,25 +116,32 @@ void create_row_representation(const csc_matrix_t<i_t, f_t>& A,
   for (i_t i = 0; i < m; ++i) {
     workspace[i] = 0;
   }
+  work_estimate += m;
 
   // Compute row degrees
   for (i_t p = 0; p < nz; ++p) {
     workspace[A.i[p]]++;
   }
+  work_estimate += 3 * nz;
+
   // Compute rowstart and overwrite workspace
   cumulative_sum(workspace, row_start);
+  work_estimate += 4 * workspace.size();
 
   for (i_t j = 0; j < n; ++j) {
-    for (i_t p = A.col_start[j]; p < A.col_start[j + 1]; ++p) {
+    const i_t col_start = A.col_start[j];
+    const i_t col_end   = A.col_start[j + 1];
+    for (i_t p = col_start; p < col_end; ++p) {
       i_t q        = workspace[A.i[p]]++;
       col_index[q] = j;
     }
   }
+  work_estimate += 2 * n + 4 * nz;
 }
 
 // Complete the permuation
-template <typename i_t, typename VectorI>
-i_t complete_permutation(i_t singletons, std::vector<i_t>& Xdeg, VectorI& Xperm)
+template <typename i_t>
+i_t complete_permutation(i_t singletons, std::vector<i_t>& Xdeg, std::vector<i_t>& Xperm)
 {
   i_t n = Xdeg.size();
   assert(Xperm.size() == n);
@@ -157,12 +166,13 @@ i_t complete_permutation(i_t singletons, std::vector<i_t>& Xdeg, VectorI& Xperm)
   return num_empty;
 }
 
-template <typename i_t, typename f_t, typename VectorI>
+template <typename i_t, typename f_t>
 i_t find_singletons(const csc_matrix_t<i_t, f_t>& A,
                     i_t& row_singletons,
-                    VectorI& row_perm,
+                    std::vector<i_t>& row_perm,
                     i_t& col_singletons,
-                    VectorI& col_perm)
+                    std::vector<i_t>& col_perm,
+                    f_t& work_estimate)
 {
   i_t n  = A.n;
   i_t m  = A.m;
@@ -173,6 +183,7 @@ i_t find_singletons(const csc_matrix_t<i_t, f_t>& A,
   std::vector<i_t> Cdeg(n);
   std::vector<i_t> Rp(m + 1);
   std::vector<i_t> Rj(nz);
+  work_estimate += 3 * m + n + nz;
 
   i_t max_queue_len = std::max(m, n);
   std::queue<i_t> singleton_queue;
@@ -186,36 +197,38 @@ i_t find_singletons(const csc_matrix_t<i_t, f_t>& A,
       Rdeg[A.i[p]]++;
     }
   }
+  work_estimate += 2 * n + 2 * nz;
 
   // Add all columns of degree 1 to the queue
   for (i_t j = n - 1; j >= 0; --j) {
     if (Cdeg[j] == 1) { singleton_queue.push(j); }
   }
+  work_estimate += n + singleton_queue.size();
 
   bool row_form        = false;
   i_t singletons_found = 0;
   col_singletons       = 0;
   if (!singleton_queue.empty()) {
     // Don't create the row representation unless we found a singleton
-    create_row_representation(A, Rp, Rj, workspace);
+    create_row_representation(A, Rp, Rj, workspace, work_estimate);
     row_form = true;
 
     // Find column singletons
-    auto& col_perm_vec = static_cast<std::vector<i_t>&>(col_perm);
-    auto& row_perm_vec = static_cast<std::vector<i_t>&>(row_perm);
     row_col_graph_t<i_t> graph{Cdeg.begin(),
-                               col_perm_vec.begin(),
-                               A.col_start.underlying().cbegin(),
-                               A.i.underlying().cbegin(),
+                               col_perm.begin(),
+                               A.col_start.cbegin(),
+                               A.i.cbegin(),
                                Rdeg.begin(),
-                               row_perm_vec.begin(),
+                               row_perm.begin(),
                                Rp.cbegin(),
                                Rj.cbegin()};
 
 #ifdef SINGLETON_DEBUG
     printf("Searching for column singletons. Initial size %ld\n", singleton_queue.size());
 #endif
-    col_singletons = order_singletons(singleton_queue, singletons_found, graph);
+
+    col_singletons = order_singletons(singleton_queue, singletons_found, graph, work_estimate);
+
 #ifdef SINGLETON_DEBUG
     printf("Found %d column singletons\n", col_singletons);
 #endif
@@ -225,30 +238,29 @@ i_t find_singletons(const csc_matrix_t<i_t, f_t>& A,
   for (i_t i = m - 1; i >= 0; --i) {
     if (Rdeg[i] == 1) { singleton_queue.push(i); }
   }
+  work_estimate += m + singleton_queue.size();
 
   row_singletons = 0;
   if (!singleton_queue.empty()) {
     if (!row_form) {
       // If we haven't created the row representation yet, we need to
-      create_row_representation(A, Rp, Rj, workspace);  // use Rdeg as workspace
+      create_row_representation(A, Rp, Rj, workspace, work_estimate);  // use Rdeg as workspace
     }
 
     // Find row singletons
-    auto& row_perm_vec2 = static_cast<std::vector<i_t>&>(row_perm);
-    auto& col_perm_vec2 = static_cast<std::vector<i_t>&>(col_perm);
     row_col_graph_t<i_t> graph{Rdeg.begin(),
-                               row_perm_vec2.begin(),
+                               row_perm.begin(),
                                Rp.cbegin(),
                                Rj.cbegin(),
                                Cdeg.begin(),
-                               col_perm_vec2.begin(),
-                               A.col_start.underlying().cbegin(),
-                               A.i.underlying().cbegin()};
+                               col_perm.begin(),
+                               A.col_start.cbegin(),
+                               A.i.cbegin()};
 #ifdef SINGLETON_DEBUG
     printf("Searching for row singletons %ld\n", singleton_queue.size());
 #endif
     i_t last       = singletons_found;
-    row_singletons = order_singletons(singleton_queue, singletons_found, graph);
+    row_singletons = order_singletons(singleton_queue, singletons_found, graph, work_estimate);
     row_singletons = row_singletons - last;
 #ifdef SINGLETON_DEBUG
     printf("Found %d row singletons. %d\n", row_singletons, singletons_found);
@@ -263,10 +275,12 @@ i_t find_singletons(const csc_matrix_t<i_t, f_t>& A,
   printf("Col singletons %d\n", col_singletons);
 #endif
   i_t num_empty_cols = complete_permutation(singletons_found, Cdeg, col_perm);
+  work_estimate += 2 * Cdeg.size();
 #ifdef SINGLETON_DEBUG
   printf("Completed col perm. %d empty cols. Starting row perm\n", num_empty_cols);
 #endif
   i_t num_empty_rows = complete_permutation(singletons_found, Rdeg, row_perm);
+  work_estimate += 2 * Rdeg.size();
 #ifdef SINGLETON_DEBUG
   printf("Empty rows %d Empty columns %d\n", num_empty_rows, num_empty_cols);
 #endif
@@ -277,35 +291,28 @@ i_t find_singletons(const csc_matrix_t<i_t, f_t>& A,
 
 template struct row_col_graph_t<int>;
 
-template int order_singletons<int>(std::queue<int>& singleton_queue,
-                                   int& singletons_found,
-                                   row_col_graph_t<int>& G);
+template int order_singletons<int, double>(std::queue<int>& singleton_queue,
+                                           int& singletons_found,
+                                           row_col_graph_t<int>& G,
+                                           double& work_estimate);
 
 // \param [in,out]  workspace - size m
 template void create_row_representation<int, double>(const csc_matrix_t<int, double>& A,
                                                      std::vector<int>& row_start,
                                                      std::vector<int>& col_index,
-                                                     std::vector<int>& workspace);
+                                                     std::vector<int>& workspace,
+                                                     double& work_estimate);
 // Complete the permuation
-template int complete_permutation<int, std::vector<int>>(int singletons,
-                                                         std::vector<int>& Xdeg,
-                                                         std::vector<int>& Xperm);
-template int complete_permutation<int, ins_vector<int>>(int singletons,
-                                                        std::vector<int>& Xdeg,
-                                                        ins_vector<int>& Xperm);
-
-template int find_singletons<int, double, std::vector<int>>(const csc_matrix_t<int, double>& A,
-                                                            int& row_singletons,
-                                                            std::vector<int>& row_perm,
-                                                            int& col_singleton,
-                                                            std::vector<int>& col_perm);
-
-template int find_singletons<int, double, ins_vector<int>>(const csc_matrix_t<int, double>& A,
-                                                           int& row_singletons,
-                                                           ins_vector<int>& row_perm,
-                                                           int& col_singleton,
-                                                           ins_vector<int>& col_perm);
-
+template int complete_permutation<int>(int singletons,
+                                       std::vector<int>& Xdeg,
+                                       std::vector<int>& Xperm);
+
+template int find_singletons<int, double>(const csc_matrix_t<int, double>& A,
+                                          int& row_singletons,
+                                          std::vector<int>& row_perm,
+                                          int& col_singleton,
+                                          std::vector<int>& col_perm,
+                                          double& work_estimate);
 #endif
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/singletons.hpp b/cpp/src/dual_simplex/singletons.hpp
index 2a1ac3e55f..9bbe9270f7 100644
--- a/cpp/src/dual_simplex/singletons.hpp
+++ b/cpp/src/dual_simplex/singletons.hpp
@@ -27,26 +27,29 @@ struct row_col_graph_t {
   typename std::vector<i_t>::const_iterator Yi;
 };
 
-template <typename i_t>
+template <typename i_t, typename f_t>
 i_t order_singletons(std::queue<i_t>& singleton_queue,
                      i_t& singletons_found,
-                     row_col_graph_t<i_t>& G);
+                     row_col_graph_t<i_t>& G,
+                     f_t& work_estimate);
 
 // \param [in,out]  workspace - size m
 template <typename i_t, typename f_t>
 void create_row_representationon(const csc_matrix_t<i_t, f_t>& A,
                                  std::vector<i_t>& row_start,
                                  std::vector<i_t>& col_index,
-                                 std::vector<i_t>& workspace);
+                                 std::vector<i_t>& workspace,
+                                 f_t& work_estimate);
 // Complete the permuation
-template <typename i_t, typename f_t>
-i_t complete_permutationn(i_t singletons, std::vector<i_t>& Xdeg, std::vector<i_t>& Xperm);
+template <typename i_t>
+i_t complete_permutation(i_t singletons, std::vector<i_t>& Xdeg, std::vector<i_t>& Xperm);
 
-template <typename i_t, typename f_t, typename VectorI>
+template <typename i_t, typename f_t>
 i_t find_singletons(const csc_matrix_t<i_t, f_t>& A,
                     i_t& row_singletons,
-                    VectorI& row_perm,
+                    std::vector<i_t>& row_perm,
                     i_t& col_singleton,
-                    VectorI& col_perm);
+                    std::vector<i_t>& col_perm,
+                    f_t& work_estimate);
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/sparse_matrix.cpp b/cpp/src/dual_simplex/sparse_matrix.cpp
index 6d9cf2b809..ec8aee09c4 100644
--- a/cpp/src/dual_simplex/sparse_matrix.cpp
+++ b/cpp/src/dual_simplex/sparse_matrix.cpp
@@ -8,12 +8,9 @@
 // #include <dual_simplex/dense_vector.hpp>
 #include <dual_simplex/sparse_matrix.hpp>
 #include <dual_simplex/sparse_vector.hpp>
-#include <utilities/memory_instrumentation.hpp>
 
 #include <dual_simplex/types.hpp>
 
-using cuopt::ins_vector;
-
 // #include <thrust/for_each.h>
 // #include <thrust/iterator/counting_iterator.h>
 
@@ -37,8 +34,8 @@ void csc_matrix_t<i_t, f_t>::reallocate(i_t new_nz)
   this->nz_max = new_nz;
 }
 
-template <typename i_t, typename OutputVector>
-void cumulative_sum(std::vector<i_t>& inout, OutputVector& output)
+template <typename i_t>
+void cumulative_sum(std::vector<i_t>& inout, std::vector<i_t>& output)
 {
   i_t n = inout.size();
   assert(output.size() == n + 1);
@@ -171,6 +168,7 @@ void csc_matrix_t<i_t, f_t>::append_column(const std::vector<f_t>& x)
   this->n++;
 }
 
+// Work = 4*x.i.size()
 template <typename i_t, typename f_t>
 void csc_matrix_t<i_t, f_t>::append_column(const sparse_vector_t<i_t, f_t>& x)
 {
@@ -193,6 +191,7 @@ void csc_matrix_t<i_t, f_t>::append_column(const sparse_vector_t<i_t, f_t>& x)
   this->n++;
 }
 
+// Work is 5*x_nz
 template <typename i_t, typename f_t>
 void csc_matrix_t<i_t, f_t>::append_column(i_t x_nz, i_t* i, f_t* x)
 {
@@ -212,6 +211,7 @@ void csc_matrix_t<i_t, f_t>::append_column(i_t x_nz, i_t* i, f_t* x)
   this->n++;
 }
 
+// Work = 6*nz + 2*n
 template <typename i_t, typename f_t>
 i_t csc_matrix_t<i_t, f_t>::transpose(csc_matrix_t<i_t, f_t>& AT) const
 {
@@ -658,8 +658,8 @@ i_t csr_matrix_t<i_t, f_t>::check_matrix(std::string matrix_name) const
 }
 
 // x <- x + alpha * A(:, j)
-template <typename i_t, typename f_t, typename VectorF>
-void scatter_dense(const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, VectorF& x)
+template <typename i_t, typename f_t>
+void scatter_dense(const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, std::vector<f_t>& x)
 {
   const i_t col_start = A.col_start[j];
   const i_t col_end   = A.col_start[j + 1];
@@ -671,9 +671,13 @@ void scatter_dense(const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, VectorF& x
 }
 
 // x <- x + alpha * A(:, j)
-template <typename i_t, typename f_t, typename VectorF, typename VectorI>
-void scatter_dense(
-  const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, VectorF& x, VectorI& mark, VectorI& indices)
+template <typename i_t, typename f_t>
+void scatter_dense(const csc_matrix_t<i_t, f_t>& A,
+                   i_t j,
+                   f_t alpha,
+                   std::vector<f_t>& x,
+                   std::vector<i_t>& mark,
+                   std::vector<i_t>& indices)
 {
   const i_t col_start = A.col_start[j];
   const i_t col_end   = A.col_start[j + 1];
@@ -933,10 +937,7 @@ template class csc_matrix_t<int, double>;
 
 template class csr_matrix_t<int, double>;
 
-template void cumulative_sum<int, std::vector<int>>(std::vector<int>& inout,
-                                                    std::vector<int>& output);
-template void cumulative_sum<int, ins_vector<int>>(std::vector<int>& inout,
-                                                   ins_vector<int>& output);
+template void cumulative_sum<int>(std::vector<int>& inout, std::vector<int>& output);
 
 template int coo_to_csc<int, double>(const std::vector<int>& Ai,
                                      const std::vector<int>& Aj,
@@ -952,31 +953,17 @@ template int scatter<int, double>(const csc_matrix_t<int, double>& A,
                                   csc_matrix_t<int, double>& C,
                                   int nz);
 
-template void scatter_dense<int, double, std::vector<double>>(const csc_matrix_t<int, double>& A,
-                                                              int j,
-                                                              double alpha,
-                                                              std::vector<double>& x);
-
-template void scatter_dense<int, double, std::vector<double>, std::vector<int>>(
-  const csc_matrix_t<int, double>& A,
-  int j,
-  double alpha,
-  std::vector<double>& x,
-  std::vector<int>& mark,
-  std::vector<int>& indices);
-
-template void scatter_dense<int, double, ins_vector<double>>(const csc_matrix_t<int, double>& A,
-                                                             int j,
-                                                             double alpha,
-                                                             ins_vector<double>& x);
-
-template void scatter_dense<int, double, ins_vector<double>, ins_vector<int>>(
-  const csc_matrix_t<int, double>& A,
-  int j,
-  double alpha,
-  ins_vector<double>& x,
-  ins_vector<int>& mark,
-  ins_vector<int>& indices);
+template void scatter_dense<int, double>(const csc_matrix_t<int, double>& A,
+                                         int j,
+                                         double alpha,
+                                         std::vector<double>& x);
+
+template void scatter_dense<int, double>(const csc_matrix_t<int, double>& A,
+                                         int j,
+                                         double alpha,
+                                         std::vector<double>& x,
+                                         std::vector<int>& mark,
+                                         std::vector<int>& indices);
 
 template int multiply<int, double>(const csc_matrix_t<int, double>& A,
                                    const csc_matrix_t<int, double>& B,
diff --git a/cpp/src/dual_simplex/sparse_matrix.hpp b/cpp/src/dual_simplex/sparse_matrix.hpp
index 8f8f622518..56e3ca82c6 100644
--- a/cpp/src/dual_simplex/sparse_matrix.hpp
+++ b/cpp/src/dual_simplex/sparse_matrix.hpp
@@ -9,7 +9,6 @@
 
 #include <dual_simplex/types.hpp>
 #include <dual_simplex/vector_math.hpp>
-#include <utilities/memory_instrumentation.hpp>
 
 #include <algorithm>
 #include <cassert>
@@ -17,9 +16,6 @@
 #include <string>
 #include <vector>
 
-// Import instrumented vector
-using cuopt::ins_vector;
-
 namespace cuopt::linear_programming::dual_simplex {
 
 template <typename i_t, typename f_t>
@@ -129,12 +125,12 @@ class csc_matrix_t {
     return true;
   }
 
-  i_t m;                      // number of rows
-  i_t n;                      // number of columns
-  i_t nz_max;                 // maximum number of entries
-  ins_vector<i_t> col_start;  // column pointers (size n + 1)
-  ins_vector<i_t> i;          // row indices, size nz_max
-  ins_vector<f_t> x;          // numerical values, size nz_max
+  i_t m;                       // number of rows
+  i_t n;                       // number of columns
+  i_t nz_max;                  // maximum number of entries
+  std::vector<i_t> col_start;  // column pointers (size n + 1)
+  std::vector<i_t> i;          // row indices, size nz_max
+  std::vector<f_t> x;          // numerical values, size nz_max
 
   static_assert(std::is_signed_v<i_t>);  // Require signed integers (we make use of this
                                          // to avoid extra space / computation)
@@ -177,18 +173,18 @@ class csr_matrix_t {
     return true;
   }
 
-  i_t nz_max;                 // maximum number of nonzero entries
-  i_t m;                      // number of rows
-  i_t n;                      // number of cols
-  ins_vector<i_t> row_start;  // row pointers (size m + 1)
-  ins_vector<i_t> j;          // column indices, size nz_max
-  ins_vector<f_t> x;          // numerical values, size nz_max
+  i_t nz_max;                  // maximum number of nonzero entries
+  i_t m;                       // number of rows
+  i_t n;                       // number of cols
+  std::vector<i_t> row_start;  // row pointers (size m + 1)
+  std::vector<i_t> j;          // column indices, size nz_max
+  std::vector<f_t> x;          // numerical values, size nz_max
 
   static_assert(std::is_signed_v<i_t>);
 };
 
-template <typename i_t, typename OutputVector>
-void cumulative_sum(std::vector<i_t>& inout, OutputVector& output);
+template <typename i_t>
+void cumulative_sum(std::vector<i_t>& inout, std::vector<i_t>& output);
 
 template <typename i_t, typename f_t>
 i_t coo_to_csc(const std::vector<i_t>& Ai,
@@ -207,12 +203,16 @@ i_t scatter(const csc_matrix_t<i_t, f_t>& A,
             i_t nz);
 
 // x <- x + alpha * A(:, j)
-template <typename i_t, typename f_t, typename VectorF>
-void scatter_dense(const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, VectorF& x);
+template <typename i_t, typename f_t>
+void scatter_dense(const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, std::vector<f_t>& x);
 
-template <typename i_t, typename f_t, typename VectorF, typename VectorI>
-void scatter_dense(
-  const csc_matrix_t<i_t, f_t>& A, i_t j, f_t alpha, VectorF& x, VectorI& mark, VectorI& indices);
+template <typename i_t, typename f_t>
+void scatter_dense(const csc_matrix_t<i_t, f_t>& A,
+                   i_t j,
+                   f_t alpha,
+                   std::vector<f_t>& x,
+                   std::vector<i_t>& mark,
+                   std::vector<i_t>& indices);
 
 // Compute C = A*B where C is m x n, A is m x k, and B = k x n
 // Do this by computing C(:, j) = A*B(:, j) = sum (i=1 to k) A(:, k)*B(i, j)
diff --git a/cpp/src/dual_simplex/sparse_vector.cpp b/cpp/src/dual_simplex/sparse_vector.cpp
index 1c6a15eb21..0d34d4c390 100644
--- a/cpp/src/dual_simplex/sparse_vector.cpp
+++ b/cpp/src/dual_simplex/sparse_vector.cpp
@@ -13,8 +13,6 @@
 
 namespace cuopt::linear_programming::dual_simplex {
 
-using cuopt::ins_vector;
-
 template <typename i_t, typename f_t>
 sparse_vector_t<i_t, f_t>::sparse_vector_t(const csc_matrix_t<i_t, f_t>& A, i_t col)
 {
@@ -70,17 +68,12 @@ void sparse_vector_t<i_t, f_t>::from_dense(const std::vector<f_t>& in)
   n = in.size();
   i.reserve(n);
   x.reserve(n);
-  auto& i_vec = i.underlying();
-  auto& x_vec = x.underlying();
   for (i_t k = 0; k < n; ++k) {
     if (in[k] != 0) {
-      i_vec.push_back(k);
-      x_vec.push_back(in[k]);
+      i.push_back(k);
+      x.push_back(in[k]);
     }
   }
-  const size_t nz = i_vec.size();
-  i.byte_stores += nz * sizeof(i_t);
-  x.byte_stores += nz * sizeof(f_t);
 }
 
 template <typename i_t, typename f_t>
@@ -92,8 +85,8 @@ void sparse_vector_t<i_t, f_t>::to_csc(csc_matrix_t<i_t, f_t>& A) const
   A.col_start.resize(2);
   A.col_start[0] = 0;
   A.col_start[1] = i.size();
-  A.i            = i.underlying();
-  A.x            = x.underlying();
+  A.i            = i;
+  A.x            = x;
 }
 
 template <typename i_t, typename f_t>
@@ -107,17 +100,6 @@ void sparse_vector_t<i_t, f_t>::to_dense(std::vector<f_t>& x_dense) const
   }
 }
 
-template <typename i_t, typename f_t>
-void sparse_vector_t<i_t, f_t>::to_dense(ins_vector<f_t>& x_dense) const
-{
-  x_dense.clear();
-  x_dense.resize(n, 0.0);
-  const i_t nz = i.size();
-  for (i_t k = 0; k < nz; ++k) {
-    x_dense[i[k]] = x[k];
-  }
-}
-
 template <typename i_t, typename f_t>
 void sparse_vector_t<i_t, f_t>::scatter(std::vector<f_t>& x_dense) const
 {
@@ -128,29 +110,12 @@ void sparse_vector_t<i_t, f_t>::scatter(std::vector<f_t>& x_dense) const
   }
 }
 
-template <typename i_t, typename f_t>
-void sparse_vector_t<i_t, f_t>::scatter(ins_vector<f_t>& x_dense) const
-{
-  // Assumes x_dense is already cleared
-  auto& sv_i   = i.underlying();
-  auto& sv_x   = x.underlying();
-  auto& dense  = x_dense.underlying();
-  const i_t nz = sv_i.size();
-  for (i_t k = 0; k < nz; ++k) {
-    dense[sv_i[k]] += sv_x[k];
-  }
-  i.byte_loads += nz * sizeof(i_t);
-  x.byte_loads += nz * sizeof(f_t);
-  x_dense.byte_loads += nz * sizeof(f_t);
-  x_dense.byte_stores += nz * sizeof(f_t);
-}
-
 template <typename i_t, typename f_t>
 void sparse_vector_t<i_t, f_t>::inverse_permute_vector(const std::vector<i_t>& p)
 {
   assert(p.size() == n);
   i_t nz = i.size();
-  ins_vector<i_t> i_perm(nz);
+  std::vector<i_t> i_perm(nz);
   for (i_t k = 0; k < nz; ++k) {
     i_perm[k] = p[i[k]];
   }
@@ -166,7 +131,7 @@ void sparse_vector_t<i_t, f_t>::inverse_permute_vector(const std::vector<i_t>& p
   i_t nz = i.size();
   y.n    = n;
   y.x    = x;
-  ins_vector<i_t> i_perm(nz);
+  std::vector<i_t> i_perm(nz);
   for (i_t k = 0; k < nz; ++k) {
     i_perm[k] = p[i[k]];
   }
@@ -232,8 +197,8 @@ void sparse_vector_t<i_t, f_t>::sort()
   } else {
     // Use a n log n sort
     const i_t nz = i.size();
-    ins_vector<i_t> i_sorted(nz);
-    ins_vector<f_t> x_sorted(nz);
+    std::vector<i_t> i_sorted(nz);
+    std::vector<f_t> x_sorted(nz);
     std::vector<i_t> perm(nz);
     for (i_t k = 0; k < nz; ++k) {
       perm[k] = k;
diff --git a/cpp/src/dual_simplex/sparse_vector.hpp b/cpp/src/dual_simplex/sparse_vector.hpp
index fadd2df338..6ea642ee07 100644
--- a/cpp/src/dual_simplex/sparse_vector.hpp
+++ b/cpp/src/dual_simplex/sparse_vector.hpp
@@ -9,15 +9,11 @@
 
 #include <dual_simplex/sparse_matrix.hpp>
 #include <dual_simplex/types.hpp>
-#include <utilities/memory_instrumentation.hpp>
 
 #include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
 
-// Import instrumented vector type
-using cuopt::ins_vector;
-
 // A sparse vector stored as a list of nonzero coefficients and their indices
 template <typename i_t, typename f_t>
 class sparse_vector_t {
@@ -37,13 +33,9 @@ class sparse_vector_t {
   void to_csc(csc_matrix_t<i_t, f_t>& A) const;
   // convert a sparse vector into a dense vector. Dense vector is cleared and resized.
   void to_dense(std::vector<f_t>& x_dense) const;
-  // convert a sparse vector into an instrumented dense vector.
-  void to_dense(ins_vector<f_t>& x_dense) const;
   // scatter a sparse vector into a dense vector. Assumes x_dense is already cleared or
   // preinitialized
   void scatter(std::vector<f_t>& x_dense) const;
-  // scatter into instrumented vector
-  void scatter(ins_vector<f_t>& x_dense) const;
   // inverse permute the current sparse vector
   void inverse_permute_vector(const std::vector<i_t>& p);
   // inverse permute a sparse vector into another sparse vector
@@ -71,8 +63,8 @@ class sparse_vector_t {
   void squeeze(sparse_vector_t<i_t, f_t>& y) const;
 
   i_t n;
-  ins_vector<i_t> i;
-  ins_vector<f_t> x;
+  std::vector<i_t> i;
+  std::vector<f_t> x;
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/triangle_solve.cpp b/cpp/src/dual_simplex/triangle_solve.cpp
index 2f280295bf..ad09ae30cf 100644
--- a/cpp/src/dual_simplex/triangle_solve.cpp
+++ b/cpp/src/dual_simplex/triangle_solve.cpp
@@ -26,19 +26,22 @@ template <typename i_t, typename f_t>
 i_t reach(const sparse_vector_t<i_t, f_t>& b,
           const std::optional<std::vector<i_t>>& pinv,
           csc_matrix_t<i_t, f_t>& G,
-          std::vector<i_t>& xi)
+          std::vector<i_t>& xi,
+          f_t& work_estimate)
 {
   const i_t m   = G.m;
   i_t top       = m;
   const i_t bnz = b.i.size();
   for (i_t p = 0; p < bnz; ++p) {
     if (!MARKED(G.col_start, b.i[p])) {  // start a DFS at unmarked node i
-      top = depth_first_search(b.i[p], pinv, G, top, xi, xi.begin() + m);
+      top = depth_first_search(b.i[p], pinv, G, top, xi, xi.begin() + m, work_estimate);
     }
   }
+  work_estimate += 4 * bnz;
   for (i_t p = top; p < m; ++p) {  // restore G
     MARK(G.col_start, xi[p]);
   }
+  work_estimate += 3 * (m - top);
   return top;
 }
 
@@ -61,7 +64,8 @@ i_t depth_first_search(i_t j,
                        csc_matrix_t<i_t, f_t>& G,
                        i_t top,
                        std::vector<i_t>& xi,
-                       typename std::vector<i_t>::iterator pstack)
+                       typename std::vector<i_t>::iterator pstack,
+                       f_t& work_estimate)
 {
   i_t head = 0;
   xi[0]    = j;  // Initialize the recursion stack
@@ -75,10 +79,12 @@ i_t depth_first_search(i_t j,
       // Point to the first outgoing edge of node j
       pstack[head] = (jnew < 0) ? 0 : UNFLIP(G.col_start[jnew]);
     }
-    done   = 1;  // Node j is done if no unvisited neighbors
-    i_t p2 = (jnew < 0) ? 0 : UNFLIP(G.col_start[jnew + 1]);
-    for (i_t p = pstack[head]; p < p2; ++p) {  // Examine all neighbors of j
-      i_t i = G.i[p];                          // Consider neighbor i
+    done           = 1;  // Node j is done if no unvisited neighbors
+    i_t p2         = (jnew < 0) ? 0 : UNFLIP(G.col_start[jnew + 1]);
+    const i_t psav = pstack[head];
+    i_t p;
+    for (p = psav; p < p2; ++p) {  // Examine all neighbors of j
+      i_t i = G.i[p];              // Consider neighbor i
       if (MARKED(G.col_start, i)) {
         continue;  // skip visited node i
       }
@@ -87,6 +93,7 @@ i_t depth_first_search(i_t j,
       done         = 0;  // node j is not done
       break;             // break to start dfs at node i
     }
+    work_estimate += 3 * (p - psav) + 10;
     if (done) {
       pstack[head] = 0;  // restore pstack so it can be used again in other routines
       xi[head]     = 0;  // restore xi so it can be used again in other routines
@@ -102,18 +109,23 @@ i_t sparse_triangle_solve(const sparse_vector_t<i_t, f_t>& b,
                           const std::optional<std::vector<i_t>>& pinv,
                           std::vector<i_t>& xi,
                           csc_matrix_t<i_t, f_t>& G,
-                          f_t* x)
+                          f_t* x,
+                          f_t& work_estimate)
 {
   i_t m = G.m;
   assert(b.n == m);
-  i_t top = reach(b, pinv, G, xi);
+  i_t top = reach(b, pinv, G, xi, work_estimate);
   for (i_t p = top; p < m; ++p) {
     x[xi[p]] = 0;  // Clear x vector
   }
+  work_estimate += 2 * (m - top);
+
   const i_t bnz = b.i.size();
   for (i_t p = 0; p < bnz; ++p) {
     x[b.i[p]] = b.x[p];  // Scatter b
   }
+  work_estimate += 3 * bnz;
+
   for (i_t px = top; px < m; ++px) {
     i_t j = xi[px];                   // x(j) is nonzero
     i_t J = pinv ? ((*pinv)[j]) : j;  // j maps to column J of G
@@ -131,6 +143,7 @@ i_t sparse_triangle_solve(const sparse_vector_t<i_t, f_t>& b,
       end = G.col_start[J + 1] - 1;
     }
     x[j] /= Gjj;
+    work_estimate += 4 * (end - p) + 7;
     for (; p < end; ++p) {
       x[G.i[p]] -= G.x[p] * x[j];  // x(i) -= G(i,j) * x(j)
     }
@@ -147,26 +160,30 @@ i_t sparse_triangle_solve(const sparse_vector_t<i_t, f_t>& b,
 template int reach<int, double>(const sparse_vector_t<int, double>& b,
                                 const std::optional<std::vector<int>>& pinv,
                                 csc_matrix_t<int, double>& G,
-                                std::vector<int>& xi);
+                                std::vector<int>& xi,
+                                double& work_estimate);
 
 template int depth_first_search<int, double>(int j,
                                              const std::optional<std::vector<int>>& pinv,
                                              csc_matrix_t<int, double>& G,
                                              int top,
                                              std::vector<int>& xi,
-                                             std::vector<int>::iterator pstack);
+                                             std::vector<int>::iterator pstack,
+                                             double& work_estimate);
 
 template int sparse_triangle_solve<int, double, true>(const sparse_vector_t<int, double>& b,
                                                       const std::optional<std::vector<int>>& pinv,
                                                       std::vector<int>& xi,
                                                       csc_matrix_t<int, double>& G,
-                                                      double* x);
+                                                      double* x,
+                                                      double& work_estimate);
 
 template int sparse_triangle_solve<int, double, false>(const sparse_vector_t<int, double>& b,
                                                        const std::optional<std::vector<int>>& pinv,
                                                        std::vector<int>& xi,
                                                        csc_matrix_t<int, double>& G,
-                                                       double* x);
+                                                       double* x,
+                                                       double& work_estimate);
 #endif
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/triangle_solve.hpp b/cpp/src/dual_simplex/triangle_solve.hpp
index 20af92cc93..811ed8b927 100644
--- a/cpp/src/dual_simplex/triangle_solve.hpp
+++ b/cpp/src/dual_simplex/triangle_solve.hpp
@@ -25,126 +25,91 @@ namespace cuopt::linear_programming::dual_simplex {
 
 // Solve L*x = b. On input x contains the right-hand side b, on output the
 // solution
-template <typename i_t, typename f_t, typename VectorF>
-i_t lower_triangular_solve(const csc_matrix_t<i_t, f_t>& L, VectorF& x)
+template <typename i_t, typename f_t>
+i_t lower_triangular_solve(const csc_matrix_t<i_t, f_t>& L, std::vector<f_t>& x, f_t& work_estimate)
 {
   i_t n = L.n;
   assert(x.size() == n);
 
-  auto& L_cs = L.col_start.underlying();
-  auto& L_i  = L.i.underlying();
-  auto& L_x  = L.x.underlying();
-
-  size_t nnz_processed = 0;
   for (i_t j = 0; j < n; ++j) {
-    i_t col_start = L_cs[j];
-    i_t col_end   = L_cs[j + 1];
+    i_t col_start = L.col_start[j];
+    i_t col_end   = L.col_start[j + 1];
     if (x[j] != 0.0) {
-      x[j] /= L_x[col_start];
-      auto x_j = x[j];  // hoist this load out of the loop
+      x[j] /= L.x[col_start];
+      const f_t x_j = x[j];  // hoist this load out of the loop
       // as the compiler cannot guess that x[j] never aliases to x[L.i[p]]
       for (i_t p = col_start + 1; p < col_end; ++p) {
-        x[L_i[p]] -= L_x[p] * x_j;
+        x[L.i[p]] -= L.x[p] * x_j;
       }
-      nnz_processed += col_end - col_start;
     }
   }
-
-  L.col_start.byte_loads += (n + 1) * sizeof(i_t);
-  L.i.byte_loads += nnz_processed * sizeof(i_t);
-  L.x.byte_loads += nnz_processed * sizeof(f_t);
-
+  work_estimate += 3 * n + 3 * L.col_start[n];
   return 0;
 }
 
 // Solve L'*x = b. On input x contains the right-hand side b, on output the
 // solution
-template <typename i_t, typename f_t, typename VectorF>
-i_t lower_triangular_transpose_solve(const csc_matrix_t<i_t, f_t>& L, VectorF& x)
+template <typename i_t, typename f_t>
+i_t lower_triangular_transpose_solve(const csc_matrix_t<i_t, f_t>& L,
+                                     std::vector<f_t>& x,
+                                     f_t& work_estimate)
 {
   const i_t n = L.n;
   assert(x.size() == n);
 
-  auto& L_cs = L.col_start.underlying();
-  auto& L_i  = L.i.underlying();
-  auto& L_x  = L.x.underlying();
-
   for (i_t j = n - 1; j >= 0; --j) {
-    const i_t col_start = L_cs[j] + 1;
-    const i_t col_end   = L_cs[j + 1];
+    const i_t col_start = L.col_start[j] + 1;
+    const i_t col_end   = L.col_start[j + 1];
     for (i_t p = col_start; p < col_end; ++p) {
-      x[j] -= L_x[p] * x[L_i[p]];
+      x[j] -= L.x[p] * x[L.i[p]];
     }
-    x[j] /= L_x[L_cs[j]];
+    x[j] /= L.x[L.col_start[j]];
   }
-
-  const size_t total_nnz = L_cs[n];
-  L.col_start.byte_loads += (n + 1) * sizeof(i_t);
-  L.i.byte_loads += total_nnz * sizeof(i_t);
-  L.x.byte_loads += total_nnz * sizeof(f_t);
-
+  work_estimate += 6 * n + 4 * L.col_start[n];
   return 0;
 }
 
 // Solve U*x = b. On input x contains the right-hand side b, on output the
 // solution
-template <typename i_t, typename f_t, typename VectorF>
-i_t upper_triangular_solve(const csc_matrix_t<i_t, f_t>& U, VectorF& x)
+template <typename i_t, typename f_t>
+i_t upper_triangular_solve(const csc_matrix_t<i_t, f_t>& U, std::vector<f_t>& x, f_t& work_estimate)
 {
   const i_t n = U.n;
   assert(x.size() == n);
-
-  auto& U_cs = U.col_start.underlying();
-  auto& U_i  = U.i.underlying();
-  auto& U_x  = U.x.underlying();
-
-  size_t nnz_processed = 0;
   for (i_t j = n - 1; j >= 0; --j) {
-    const i_t col_start = U_cs[j];
-    const i_t col_end   = U_cs[j + 1] - 1;
+    const i_t col_start = U.col_start[j];
+    const i_t col_end   = U.col_start[j + 1] - 1;
     if (x[j] != 0.0) {
-      x[j] /= U_x[col_end];
-      auto x_j = x[j];  // same x_j load hoisting
+      x[j] /= U.x[col_end];
+      const f_t x_j = x[j];  // same x_j load hoisting
       for (i_t p = col_start; p < col_end; ++p) {
-        x[U_i[p]] -= U_x[p] * x_j;
+        x[U.i[p]] -= U.x[p] * x_j;
       }
-      nnz_processed += col_end - col_start;
+      work_estimate += 3 * (col_end - col_start) + 3;
     }
   }
-
-  U.col_start.byte_loads += (n + 1) * sizeof(i_t);
-  U.i.byte_loads += nnz_processed * sizeof(i_t);
-  U.x.byte_loads += nnz_processed * sizeof(f_t);
-
+  work_estimate += 3 * n;
   return 0;
 }
 
 // Solve U'*x = b. On input x contains the right-hand side b, on output the
 // solution
-template <typename i_t, typename f_t, typename VectorF>
-i_t upper_triangular_transpose_solve(const csc_matrix_t<i_t, f_t>& U, VectorF& x)
+template <typename i_t, typename f_t>
+i_t upper_triangular_transpose_solve(const csc_matrix_t<i_t, f_t>& U,
+                                     std::vector<f_t>& x,
+                                     f_t& work_estimate)
 {
   const i_t n = U.n;
   assert(x.size() == n);
-
-  auto& U_cs = U.col_start.underlying();
-  auto& U_i  = U.i.underlying();
-  auto& U_x  = U.x.underlying();
-
   for (i_t j = 0; j < n; ++j) {
-    const i_t col_start = U_cs[j];
-    const i_t col_end   = U_cs[j + 1] - 1;
+    const i_t col_start = U.col_start[j];
+    const i_t col_end   = U.col_start[j + 1] - 1;
     for (i_t p = col_start; p < col_end; ++p) {
-      x[j] -= U_x[p] * x[U_i[p]];
+      x[j] -= U.x[p] * x[U.i[p]];
     }
-    x[j] /= U_x[col_end];
+    x[j] /= U.x[col_end];
   }
-
-  const size_t total_nnz = U_cs[n];
-  U.col_start.byte_loads += (n + 1) * sizeof(i_t);
-  U.i.byte_loads += total_nnz * sizeof(i_t);
-  U.x.byte_loads += total_nnz * sizeof(f_t);
-
+  work_estimate += 4 * n + 4 * U.col_start[n];
   return 0;
 }
 
@@ -159,7 +124,8 @@ template <typename i_t, typename f_t>
 i_t reach(const sparse_vector_t<i_t, f_t>& b,
           const std::optional<std::vector<i_t>>& pinv,
           csc_matrix_t<i_t, f_t>& G,
-          std::vector<i_t>& xi);
+          std::vector<i_t>& xi,
+          f_t& work_estimate);
 
 // \brief Performs a depth-first search starting from node j in the graph
 // defined by G
@@ -182,7 +148,8 @@ i_t depth_first_search(i_t j,
                        csc_matrix_t<i_t, f_t>& G,
                        i_t top,
                        std::vector<i_t>& xi,
-                       typename std::vector<i_t>::iterator pstack);
+                       typename std::vector<i_t>::iterator pstack,
+                       f_t& work_estimate);
 
 // \brief sparse_triangle_solve solve L*x = b or U*x = b where L is a sparse lower
 // triangular matrix
@@ -201,6 +168,7 @@ i_t sparse_triangle_solve(const sparse_vector_t<i_t, f_t>& b,
                           const std::optional<std::vector<i_t>>& pinv,
                           std::vector<i_t>& xi,
                           csc_matrix_t<i_t, f_t>& G,
-                          f_t* x);
+                          f_t* x,
+                          f_t& work_estimate);
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/vector_math.cpp b/cpp/src/dual_simplex/vector_math.cpp
index 8ea31d3e39..0db09b00b6 100644
--- a/cpp/src/dual_simplex/vector_math.cpp
+++ b/cpp/src/dual_simplex/vector_math.cpp
@@ -57,6 +57,7 @@ f_t dot(const std::vector<f_t>& x, const std::vector<f_t>& y)
   return dot;
 }
 
+// Work = 3*min(nz_x, nz_y)
 template <typename i_t, typename f_t>
 f_t sparse_dot(
   i_t const* xind, f_t const* xval, i_t nx, i_t const* yind, i_t ny, f_t const* y_scatter_val)
@@ -123,8 +124,48 @@ f_t sparse_dot(const std::vector<i_t>& xind,
   return dot;
 }
 
-// NOTE: permute_vector, inverse_permute_vector, and inverse_permutation are now
-// templated on vector types and defined in the header file.
+// Computes x = P*b or x=b(p) in MATLAB.
+// Work is 3*n
+template <typename i_t, typename f_t>
+i_t permute_vector(const std::vector<i_t>& p, const std::vector<f_t>& b, std::vector<f_t>& x)
+{
+  i_t n = p.size();
+  assert(x.size() == n);
+  assert(b.size() == n);
+  for (i_t k = 0; k < n; ++k) {
+    x[k] = b[p[k]];
+  }
+  return 0;
+}
+
+// Computes x = P'*b or x(p) = b in MATLAB.
+// Work is 3 * n
+template <typename i_t, typename f_t>
+i_t inverse_permute_vector(const std::vector<i_t>& p,
+                           const std::vector<f_t>& b,
+                           std::vector<f_t>& x)
+{
+  i_t n = p.size();
+  assert(x.size() == n);
+  assert(b.size() == n);
+  for (i_t k = 0; k < n; ++k) {
+    x[p[k]] = b[k];
+  }
+  return 0;
+}
+
+// Computes pinv from p. Or pinv(p) = 1:n in MATLAB
+// Work is 2*n
+template <typename i_t>
+i_t inverse_permutation(const std::vector<i_t>& p, std::vector<i_t>& pinv)
+{
+  i_t n = p.size();
+  if (pinv.size() != n) { pinv.resize(n); }
+  for (i_t k = 0; k < n; ++k) {
+    pinv[p[k]] = k;
+  }
+  return 0;
+}
 
 #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
 
@@ -160,8 +201,14 @@ template double sparse_dot<int, double>(int const* xind,
 template double sparse_dot<int, double>(
   int* xind, double* xval, int nx, int* yind, double* yval, int ny);
 
-// NOTE: permute_vector, inverse_permute_vector, and inverse_permutation are now
-// templated on vector types and defined in the header file, so no explicit instantiation needed.
+template int permute_vector<int, double>(const std::vector<int>& p,
+                                         const std::vector<double>& b,
+                                         std::vector<double>& x);
+template int inverse_permute_vector<int, double>(const std::vector<int>& p,
+                                                 const std::vector<double>& b,
+                                                 std::vector<double>& x);
+template int inverse_permutation<int>(const std::vector<int>& p, std::vector<int>& pinv);
+
 #endif
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/vector_math.hpp b/cpp/src/dual_simplex/vector_math.hpp
index 6bf573e892..478e024e51 100644
--- a/cpp/src/dual_simplex/vector_math.hpp
+++ b/cpp/src/dual_simplex/vector_math.hpp
@@ -57,41 +57,16 @@ template <typename i_t, typename f_t>
 f_t sparse_dot(i_t* xind, f_t* xval, i_t nx, i_t* yind, f_t* yval, i_t ny);
 
 // Computes x = P*b or x=b(p) in MATLAB.
-template <typename VectorI, typename VectorF_in, typename VectorF_out>
-int permute_vector(const VectorI& p, const VectorF_in& b, VectorF_out& x)
-{
-  auto n = p.size();
-  assert(x.size() == n);
-  assert(b.size() == n);
-  for (decltype(n) k = 0; k < n; ++k) {
-    x[k] = b[p[k]];
-  }
-  return 0;
-}
-
+template <typename i_t, typename f_t>
+i_t permute_vector(const std::vector<i_t>& p, const std::vector<f_t>& b, std::vector<f_t>& x);
 // Computes x = P'*b or x(p) = b in MATLAB.
-template <typename VectorI, typename VectorF_in, typename VectorF_out>
-int inverse_permute_vector(const VectorI& p, const VectorF_in& b, VectorF_out& x)
-{
-  auto n = p.size();
-  assert(x.size() == n);
-  assert(b.size() == n);
-  for (decltype(n) k = 0; k < n; ++k) {
-    x[p[k]] = b[k];
-  }
-  return 0;
-}
+template <typename i_t, typename f_t>
+i_t inverse_permute_vector(const std::vector<i_t>& p,
+                           const std::vector<f_t>& b,
+                           std::vector<f_t>& x);
 
 // Computes pinv from p. Or pinv(p) = 1:n in MATLAB
-template <typename VectorI_in, typename VectorI_out>
-int inverse_permutation(const VectorI_in& p, VectorI_out& pinv)
-{
-  auto n = p.size();
-  if (pinv.size() != n) { pinv.resize(n); }
-  for (decltype(n) k = 0; k < n; ++k) {
-    pinv[p[k]] = k;
-  }
-  return 0;
-}
+template <typename i_t>
+i_t inverse_permutation(const std::vector<i_t>& p, std::vector<i_t>& pinv);
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu
index df7f37c541..d77e2e5f65 100644
--- a/cpp/src/mip_heuristics/problem/problem.cu
+++ b/cpp/src/mip_heuristics/problem/problem.cu
@@ -2090,9 +2090,9 @@ void problem_t<i_t, f_t>::get_host_user_problem(
   user_problem.objective = cuopt::host_copy(objective_coefficients, stream);
 
   dual_simplex::csr_matrix_t<i_t, f_t> csr_A(m, n, nz);
-  csr_A.x         = ins_vector<f_t>(cuopt::host_copy(coefficients, stream));
-  csr_A.j         = ins_vector<i_t>(cuopt::host_copy(variables, stream));
-  csr_A.row_start = ins_vector<i_t>(cuopt::host_copy(offsets, stream));
+  csr_A.x         = std::vector<f_t>(cuopt::host_copy(coefficients, stream));
+  csr_A.j         = std::vector<i_t>(cuopt::host_copy(variables, stream));
+  csr_A.row_start = std::vector<i_t>(cuopt::host_copy(offsets, stream));
 
   csr_A.to_compressed_col(user_problem.A);
 
diff --git a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp
index 6ecb37c6e6..aebe87b140 100644
--- a/cpp/src/pdlp/translate.hpp
+++ b/cpp/src/pdlp/translate.hpp
@@ -30,9 +30,9 @@ static dual_simplex::user_problem_t<i_t, f_t> cuopt_problem_to_simplex_problem(
   user_problem.objective = cuopt::host_copy(model.objective_coefficients, handle_ptr->get_stream());
 
   dual_simplex::csr_matrix_t<i_t, f_t> csr_A(m, n, nz);
-  csr_A.x         = ins_vector<f_t>(cuopt::host_copy(model.coefficients, handle_ptr->get_stream()));
-  csr_A.j         = ins_vector<i_t>(cuopt::host_copy(model.variables, handle_ptr->get_stream()));
-  csr_A.row_start = ins_vector<i_t>(cuopt::host_copy(model.offsets, handle_ptr->get_stream()));
+  csr_A.x = std::vector<f_t>(cuopt::host_copy(model.coefficients, handle_ptr->get_stream()));
+  csr_A.j = std::vector<i_t>(cuopt::host_copy(model.variables, handle_ptr->get_stream()));
+  csr_A.row_start = std::vector<i_t>(cuopt::host_copy(model.offsets, handle_ptr->get_stream()));
 
   csr_A.to_compressed_col(user_problem.A);
 
@@ -121,9 +121,9 @@ void translate_to_crossover_problem(const detail::problem_t<i_t, f_t>& problem,
 
   dual_simplex::csr_matrix_t<i_t, f_t> csr_A(
     problem.n_constraints, problem.n_variables, problem.nnz);
-  csr_A.x         = ins_vector<f_t>(cuopt::host_copy(problem.coefficients, stream));
-  csr_A.j         = ins_vector<i_t>(cuopt::host_copy(problem.variables, stream));
-  csr_A.row_start = ins_vector<i_t>(cuopt::host_copy(problem.offsets, stream));
+  csr_A.x         = std::vector<f_t>(cuopt::host_copy(problem.coefficients, stream));
+  csr_A.j         = std::vector<i_t>(cuopt::host_copy(problem.variables, stream));
+  csr_A.row_start = std::vector<i_t>(cuopt::host_copy(problem.offsets, stream));
 
   stream.synchronize();
   CUOPT_LOG_DEBUG("Converting to compressed column");