diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 386e654488..ad20034f46 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -1390,7 +1390,7 @@ class iteration_data_t { // v = alpha * A * w + beta * v = alpha * A * Dinv * A^T * y + beta * v matrix_vector_multiply(A, alpha, w, beta, v); if (debug) { - printf("||A|| = %.16e\n", vector_norm2(A.x.underlying())); + printf("||A|| = %.16e\n", vector_norm2(A.x)); printf("||w|| = %.16e\n", vector_norm2(w)); printf("||v|| = %.16e\n", vector_norm2(v)); } diff --git a/cpp/src/barrier/cusparse_view.cu b/cpp/src/barrier/cusparse_view.cu index 05488e77e2..11d036504f 100644 --- a/cpp/src/barrier/cusparse_view.cu +++ b/cpp/src/barrier/cusparse_view.cu @@ -159,9 +159,9 @@ cusparse_view_t::cusparse_view_t(raft::handle_t const* handle_ptr, A_indices_ = device_copy(indices, handle_ptr->get_stream()); A_data_ = device_copy(data, handle_ptr->get_stream()); - A_T_offsets_ = device_copy(A.col_start.underlying(), handle_ptr->get_stream()); - A_T_indices_ = device_copy(A.i.underlying(), handle_ptr->get_stream()); - A_T_data_ = device_copy(A.x.underlying(), handle_ptr->get_stream()); + A_T_offsets_ = device_copy(A.col_start, handle_ptr->get_stream()); + A_T_indices_ = device_copy(A.i, handle_ptr->get_stream()); + A_T_data_ = device_copy(A.x, handle_ptr->get_stream()); cusparseCreateCsr(&A_, rows, diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp index 17f997f4a9..777687d49b 100644 --- a/cpp/src/dual_simplex/basis_solves.cpp +++ b/cpp/src/dual_simplex/basis_solves.cpp @@ -17,6 +17,7 @@ namespace cuopt::linear_programming::dual_simplex { +// Work = 3 * m template i_t reorder_basic_list(const std::vector& q, std::vector& basic_list) { @@ -165,7 +166,8 @@ i_t factorize_basis(const csc_matrix_t& A, std::vector& pinv, std::vector& q, std::vector& deficient, - std::vector& slacks_needed) + std::vector& slacks_needed, + f_t& work_estimate) { raft::common::nvtx::range scope("LU::factorize_basis"); const i_t m = basic_list.size(); @@ -177,14 +179,18 @@ i_t factorize_basis(const csc_matrix_t& A, // TODO: We should see if we can find the singletons without explictly forming the matrix B f_t fact_start = tic(); csc_matrix_t B(A.m, A.m, 1); - form_b(A, basic_list, B); + work_estimate += A.m; + form_b(A, basic_list, B, work_estimate); std::vector row_perm(m); std::vector col_perm(m); + work_estimate += 2 * m; i_t row_singletons; i_t col_singletons; - find_singletons(B, row_singletons, row_perm, col_singletons, col_perm); + find_singletons(B, row_singletons, row_perm, col_singletons, col_perm, work_estimate); std::vector row_perm_inv(m); + work_estimate += m; inverse_permutation(row_perm, row_perm_inv); + work_estimate += 3 * m; #ifdef PRINT_SINGLETONS printf("Singletons row %d col %d num %d\n", @@ -219,6 +225,7 @@ i_t factorize_basis(const csc_matrix_t& A, const i_t Bnz = B.col_start[m]; L.reallocate(Bnz); U.reallocate(Bnz); + work_estimate += 2 * Bnz; i_t Lnz = 0; // Fill in L(:, 0:col_singletons-1) with I for (i_t k = 0; k < col_singletons; ++k) { @@ -228,6 +235,7 @@ i_t factorize_basis(const csc_matrix_t& A, Lnz++; assert(Lnz <= Bnz); } + work_estimate += 3 * col_singletons; i_t Unz = 0; // Fill in U(:, 0:col_singletons-1) with U_11 @@ -242,8 +250,10 @@ i_t factorize_basis(const csc_matrix_t& A, Unz++; assert(Unz <= Bnz); } + work_estimate += 4 * (col_end - col_start); } if (col_singletons > 0) { U.col_start[col_singletons] = Unz; } + work_estimate += 3 * col_singletons; // Ensure U(i, i) is at the end of column i for U_11 for (i_t k = 0; k < col_singletons; ++k) { @@ -259,7 +269,9 @@ i_t factorize_basis(const csc_matrix_t& A, break; } } + work_estimate += (col_before_end - col_start); } + work_estimate += 2 * col_singletons; // Fill in L(:, col_singletons:col_singletons+row_singletons-1) with L_22 and L_32 // and U(:, col_singletons:col_singletons+row_singletons-1) with U_12 and I @@ -284,6 +296,7 @@ i_t factorize_basis(const csc_matrix_t& A, assert(Unz <= Bnz); } } + work_estimate += 5 * (col_end - col_start); // add in the identity in U U.i[Unz] = k; U.x[Unz] = 1.0; @@ -291,6 +304,7 @@ i_t factorize_basis(const csc_matrix_t& A, assert(Unz <= Bnz); } L.col_start[num_singletons] = Lnz; + work_estimate += 7 * (num_singletons - col_singletons); // Ensure L(i, i) is at the beginning of column i for L_22 and L32 for (i_t k = col_singletons; k < num_singletons; ++k) { @@ -310,8 +324,10 @@ i_t factorize_basis(const csc_matrix_t& A, break; } } + work_estimate += (col_end - col_start); assert(found_diag); } + work_estimate += 4 * (num_singletons - col_singletons); // Compute how many nonzeros in B we have used so far so we know // how many nonzeros are in S @@ -322,7 +338,7 @@ i_t factorize_basis(const csc_matrix_t& A, f_t actual_factor = 0; if (Sdim > 0) { csc_matrix_t S(Sdim, Sdim, Snz_max); - + work_estimate += Sdim + Snz_max; // Build S i_t Snz = 0; for (i_t k = num_singletons; k < m; ++k) { @@ -341,13 +357,18 @@ i_t factorize_basis(const csc_matrix_t& A, assert(Snz <= Snz_max); } } + work_estimate += 3 * (col_end - col_start); } S.col_start[Sdim] = Snz; // Finalize S + work_estimate += 4 * (m - num_singletons); csc_matrix_t SL(Sdim, Sdim, Snz); csc_matrix_t SU(Sdim, Sdim, Snz); + work_estimate += 2 * Sdim + 2 * Snz; + // Factorize S std::vector S_perm_inv(Sdim); + work_estimate += Sdim; std::optional> empty = std::nullopt; f_t actual_factor_start = tic(); @@ -356,6 +377,8 @@ i_t factorize_basis(const csc_matrix_t& A, for (i_t h = 0; h < Sdim; ++h) { identity[h] = h; } + work_estimate += 3 * Sdim; + Srank = right_looking_lu(S, settings, settings.threshold_partial_pivoting_tol, @@ -363,7 +386,8 @@ i_t factorize_basis(const csc_matrix_t& A, S_col_perm, SL, SU, - S_perm_inv); + S_perm_inv, + work_estimate); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { settings.log.printf("Concurrent halt\n"); return CONCURRENT_HALT_RETURN; @@ -375,33 +399,40 @@ i_t factorize_basis(const csc_matrix_t& A, for (i_t h = Srank; h < Sdim; ++h) { deficient[h - Srank] = col_perm[num_singletons + S_col_perm[h]]; } + work_estimate += 3 * (Sdim - Srank); // Get S_perm std::vector S_perm(Sdim); inverse_permutation(S_perm_inv, S_perm); + work_estimate += 4 * Sdim; // Get the slacks needed slacks_needed.resize(Sdim - Srank); for (i_t h = Srank; h < Sdim; ++h) { slacks_needed[h - Srank] = row_perm[num_singletons + S_perm[h]]; } + work_estimate += 3 * (Sdim - Srank); return -1; } // Need to permute col_perm[k] according to q std::vector col_perm_sav(m - num_singletons); + work_estimate += (m - num_singletons); i_t q_j = 0; for (i_t h = num_singletons; h < m; ++h) { col_perm_sav[q_j] = col_perm[h]; q_j++; } + work_estimate += 2 * (m - num_singletons); q_j = 0; for (i_t h = num_singletons; h < m; ++h) { col_perm[h] = col_perm_sav[S_col_perm[q_j]]; q_j++; } + work_estimate += 2 * (m - num_singletons); std::vector S_perm(m); inverse_permutation(S_perm_inv, S_perm); + work_estimate += 4 * m; actual_factor = toc(actual_factor_start); // Permute the rows of L_32 according to S_perm_inv @@ -415,11 +446,16 @@ i_t factorize_basis(const csc_matrix_t& A, L.i[p] = new_i; } } + work_estimate += (col_end - col_start); } + work_estimate += 3 * (num_singletons - col_singletons); const i_t SLnz = SL.col_start[Sdim]; const i_t Lnz_max = Lnz + SLnz; - if (Lnz_max > Bnz) { L.reallocate(Lnz_max); } + if (Lnz_max > Bnz) { + L.reallocate(Lnz_max); + work_estimate += Lnz_max; + } // Fill in L(:, num_singletons:m-1) with L_33 for (i_t k = num_singletons; k < m; ++k) { @@ -434,13 +470,19 @@ i_t factorize_basis(const csc_matrix_t& A, Lnz++; assert(Lnz <= Lnz_max); } + work_estimate += 4 * (col_end - col_start); } + work_estimate += 3 * (m - num_singletons); + assert(Lnz == Lnz_max); L.col_start[m] = Lnz; // Finalize L const i_t SUnz = SU.col_start[Sdim]; const i_t Unz_max = Unz + SUnz + (Bnz - Bnz_used); - if (Unz_max > Bnz) { U.reallocate(Unz_max); } + if (Unz_max > Bnz) { + U.reallocate(Unz_max); + work_estimate += Unz_max; + } // Fill in U(:, num_singletons:m-1) with U_13 and U_33 for (i_t k = num_singletons; k < m; ++k) { @@ -459,6 +501,7 @@ i_t factorize_basis(const csc_matrix_t& A, assert(Unz <= Unz_max); } } + work_estimate += 3 * (B_col_end - B_col_start); // U_33 const i_t l = k - num_singletons; @@ -471,7 +514,10 @@ i_t factorize_basis(const csc_matrix_t& A, Unz++; assert(Unz <= Unz_max); } + work_estimate += 4 * (U_col_end - U_col_start); } + work_estimate += 5 * (m - num_singletons); + assert(Unz <= Unz_max); U.col_start[m] = Unz; // Finalize U @@ -479,12 +525,15 @@ i_t factorize_basis(const csc_matrix_t& A, for (i_t k = 0; k < Sdim; ++k) { last_perm[k] = row_perm[num_singletons + k]; } + work_estimate += 3 * Sdim; // Fix up row permutations for (i_t k = 0; k < Sdim; ++k) { row_perm[num_singletons + k] = last_perm[S_perm[k]]; } + work_estimate += 3 * Sdim; inverse_permutation(row_perm, row_perm_inv); + work_estimate += 3 * row_perm.size(); } else { L.col_start[m] = Lnz; // Finalize L U.col_start[m] = Unz; // Finalize U @@ -563,13 +612,16 @@ i_t factorize_basis(const csc_matrix_t& A, if (write_basis) { csc_matrix_t B(m, m, 1); - form_b(A, basic_list, B); + form_b(A, basic_list, B, work_estimate); write_basis_info(B); } q.resize(m); + work_estimate += m; f_t fact_start = tic(); - rank = right_looking_lu(A, settings, medium_tol, basic_list, q, L, U, pinv); + rank = right_looking_lu(A, settings, medium_tol, basic_list, q, L, U, pinv, work_estimate); inverse_permutation(pinv, p); + work_estimate += 3 * pinv.size(); + if (rank != m) { // Get the rank deficient columns deficient.clear(); @@ -577,11 +629,13 @@ i_t factorize_basis(const csc_matrix_t& A, for (i_t h = rank; h < m; ++h) { deficient[h - rank] = q[h]; } + work_estimate += 3 * (m - rank); // Get the slacks needed slacks_needed.resize(m - rank); for (i_t h = rank; h < m; ++h) { slacks_needed[h - rank] = p[h]; } + work_estimate += 3 * (m - rank); } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { settings.log.printf("Concurrent halt\n"); @@ -596,7 +650,7 @@ i_t factorize_basis(const csc_matrix_t& A, multiply(L, U, C); csc_matrix_t B(m, m, 1); - form_b(A, basic_list, B); + form_b(A, basic_list, B, work_estimate); csc_matrix_t D(m, m, 1); B.permute_rows_and_cols(pinv, q, D); @@ -623,7 +677,8 @@ i_t basis_repair(const csc_matrix_t& A, std::vector& basis_list, std::vector& nonbasic_list, std::vector& superbasic_list, - std::vector& vstatus) + std::vector& vstatus, + f_t& work_estimate) { const i_t m = A.m; const i_t n = A.n; @@ -632,6 +687,7 @@ i_t basis_repair(const csc_matrix_t& A, // Create slack_map std::vector slack_map(m); // slack_map[i] = j if column j is e_i + work_estimate += m; i_t slacks_found = 0; for (i_t j = n - 1; j >= n - m; j--) { const i_t col_start = A.col_start[j]; @@ -643,23 +699,29 @@ i_t basis_repair(const csc_matrix_t& A, slacks_found++; } } + work_estimate += 3 * m + 2 * slacks_found; assert(slacks_found == m); // Create nonbasic_map std::vector nonbasic_map( n, -1); // nonbasic_map[j] = p if nonbasic[p] = j, -1 if j is basic/superbasic + work_estimate += n; const i_t num_nonbasic = nonbasic_list.size(); for (i_t k = 0; k < num_nonbasic; ++k) { nonbasic_map[nonbasic_list[k]] = k; } + work_estimate += 2 * num_nonbasic; + // Create a superbasic_map std::vector superbasic_map( n, -1); // superbasic_map[j] = p if superbasic[p] = j, -1 if j is basic/nonbasic + work_estimate += n; const i_t num_superbasic = superbasic_list.size(); for (i_t k = 0; k < num_superbasic; ++k) { superbasic_map[superbasic_list[k]] = k; } + work_estimate += 2 * num_superbasic; const i_t columns_to_replace = deficient.size(); for (i_t k = 0; k < columns_to_replace; ++k) { @@ -687,6 +749,7 @@ i_t basis_repair(const csc_matrix_t& A, } vstatus[replace_j] = variable_status_t::BASIC; } + work_estimate += 11 * columns_to_replace; return 0; } @@ -694,7 +757,8 @@ i_t basis_repair(const csc_matrix_t& A, template i_t form_b(const csc_matrix_t& A, const std::vector& basic_list, - csc_matrix_t& B) + csc_matrix_t& B, + f_t& work_estimate) { const i_t m = A.m; i_t Bnz = 0; @@ -704,7 +768,9 @@ i_t form_b(const csc_matrix_t& A, const i_t col_end = A.col_start[j + 1]; Bnz += (col_end - col_start); } + work_estimate += 3 * m; B.reallocate(Bnz); + work_estimate += 2 * Bnz; const i_t Bnz_check = Bnz; Bnz = 0; for (i_t k = 0; k < m; ++k) { @@ -718,6 +784,8 @@ i_t form_b(const csc_matrix_t& A, Bnz++; } } + work_estimate += 4 * m; + work_estimate += 4 * Bnz; B.col_start[m] = Bnz; assert(Bnz_check == Bnz); return 0; @@ -784,9 +852,10 @@ i_t b_transpose_solve(const csc_matrix_t& L, raft::common::nvtx::range scope("LU::b_transpose_solve"); + f_t work_estimate = 0; // Solve for r such that U'*r = c std::vector r = rhs; - upper_triangular_transpose_solve(U, r); + upper_triangular_transpose_solve(U, r, work_estimate); #ifdef BASIS_DEBUG // err = norm(U'*r - c, inf) std::vector residual = rhs; @@ -798,7 +867,7 @@ i_t b_transpose_solve(const csc_matrix_t& L, #endif // Solve for w such that L'*w = r - lower_triangular_transpose_solve(L, r); + lower_triangular_transpose_solve(L, r, work_estimate); #ifdef BASIS_DEBUG // err2 = norm(L'*w -r, inf) matrix_transpose_vector_multiply(L, 1.0, r, -1.0, residual2); @@ -825,13 +894,14 @@ i_t b_solve(const csc_matrix_t& L, assert(p.size() == m); assert(rhs.size() == m); assert(solution.size() == m); + f_t work_estimate = 0; // P*B = L*U // B*x = b // P*B*x = P*b = b' permute_vector(p, rhs, solution); // Solve for v such that L*v = b' - lower_triangular_solve(L, solution); + lower_triangular_solve(L, solution, work_estimate); #ifdef BASIS_DEBUG std::vector residual1(m); @@ -844,7 +914,7 @@ i_t b_solve(const csc_matrix_t& L, #endif // Solve for x such that U*x = v - upper_triangular_solve(U, solution); + upper_triangular_solve(U, solution, work_estimate); #ifdef BASIS_DEBUG matrix_vector_multiply(U, 1.0, solution, -1.0, residual2); const f_t err2 = vector_norm_inf(residual2); @@ -874,7 +944,8 @@ template int factorize_basis(const csc_matrix_t& A, std::vector& pinv, std::vector& q, std::vector& deficient, - std::vector& slacks_needed); + std::vector& slacks_needed, + double& work_estimate); template int basis_repair(const csc_matrix_t& A, const simplex_solver_settings_t& settings, @@ -885,11 +956,13 @@ template int basis_repair(const csc_matrix_t& A, std::vector& basis_list, std::vector& nonbasic_list, std::vector& superbasic_list, - std::vector& vstatus); + std::vector& vstatus, + double& work_estimate); template int form_b(const csc_matrix_t& A, const std::vector& basic_list, - csc_matrix_t& B); + csc_matrix_t& B, + double& work_estimate); template int b_multiply(const lp_problem_t& lp, const std::vector& basic_list, diff --git a/cpp/src/dual_simplex/basis_solves.hpp b/cpp/src/dual_simplex/basis_solves.hpp index 59b4725e42..fbeba565f1 100644 --- a/cpp/src/dual_simplex/basis_solves.hpp +++ b/cpp/src/dual_simplex/basis_solves.hpp @@ -36,7 +36,8 @@ i_t factorize_basis(const csc_matrix_t& A, std::vector& pinv, std::vector& q, std::vector& deficient, - std::vector& slacks_need); + std::vector& slacks_need, + f_t& work_estimate); // Repair the basis by bringing in slacks template @@ -49,13 +50,15 @@ i_t basis_repair(const csc_matrix_t& A, std::vector& basis_list, std::vector& nonbasic_list, std::vector& superbasic_list, - std::vector& vstatus); + std::vector& vstatus, + f_t& work_estimate); // Form the basis matrix B = A(:, basic_list) template i_t form_b(const csc_matrix_t& A, const std::vector& basic_list, - csc_matrix_t& B); + csc_matrix_t& B, + f_t& work_estimate); // y = B*x = sum_{j in basis} A(:, j) * x(k) template diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp index 71dce2e39c..a5f59ef579 100644 --- a/cpp/src/dual_simplex/basis_updates.cpp +++ b/cpp/src/dual_simplex/basis_updates.cpp @@ -237,7 +237,8 @@ i_t basis_update_t::l_solve(std::vector& rhs) const #endif // First solve // L0*x0 = b - dual_simplex::lower_triangular_solve(L0_, rhs); + f_t work_estimate = 0; + dual_simplex::lower_triangular_solve(L0_, rhs, work_estimate); #ifdef CHECK_LOWER_SOLVE { matrix_vector_multiply(L0_, 1.0, rhs, -1.0, residual); @@ -275,8 +276,9 @@ i_t basis_update_t::l_solve(sparse_vector_t& rhs) const // L0*x0 = b const i_t m = L0_.m; - i_t top = sparse_triangle_solve( - rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data()); + f_t work_estimate = 0; + i_t top = sparse_triangle_solve( + rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data(), work_estimate); solve_to_sparse_vector(top, rhs); // Uses xi_workspace_ and x_workspace_ to fill rhs #ifdef CHECK_L_SOLVE @@ -370,7 +372,8 @@ i_t basis_update_t::l_transpose_solve(std::vector& rhs) const } // L0'*y = c // TODO: handle a sparse rhs - dual_simplex::lower_triangular_transpose_solve(L0_, rhs); + f_t work_estimate = 0; + dual_simplex::lower_triangular_transpose_solve(L0_, rhs, work_estimate); return 0; } @@ -500,8 +503,9 @@ i_t basis_update_t::l_transpose_solve(sparse_vector_t& rhs) rhs.to_dense(cprime_dense); #endif - i_t top = sparse_triangle_solve( - rhs, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data()); + f_t work_estimate = 0; + i_t top = sparse_triangle_solve( + rhs, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data(), work_estimate); solve_to_sparse_vector(top, rhs); // Uses xi_workspace_ and x_workspace_ to fill rhs #ifdef CHECK_LOWER_TRANSPOSE_SOLVE @@ -558,7 +562,8 @@ i_t basis_update_t::u_solve(std::vector& x) const std::vector residual = bprime; #endif - dual_simplex::upper_triangular_solve(U_, bprime); + f_t work_estimate = 0; + dual_simplex::upper_triangular_solve(U_, bprime, work_estimate); #ifdef CHECK_UPPER_SOLVE matrix_vector_multiply(U_, 1.0, bprime, -1.0, residual); @@ -582,8 +587,9 @@ i_t basis_update_t::u_solve(sparse_vector_t& rhs) const sparse_vector_t bprime(m, 0); rhs.inverse_permute_vector(col_permutation_, bprime); - i_t top = sparse_triangle_solve( - bprime, std::nullopt, xi_workspace_, U_, x_workspace_.data()); + f_t work_estimate = 0; + i_t top = sparse_triangle_solve( + bprime, std::nullopt, xi_workspace_, U_, x_workspace_.data(), work_estimate); solve_to_sparse_vector(top, rhs); // Uses xi_workspace_ and x_workspace_ to fill rhs rhs.inverse_permute_vector(inverse_col_permutation_); @@ -604,7 +610,8 @@ i_t basis_update_t::u_transpose_solve(std::vector& x) const const i_t m = U_.m; std::vector bprime(m); inverse_permute_vector(col_permutation_, x, bprime); - dual_simplex::upper_triangular_transpose_solve(U_, bprime); + f_t work_estimate = 0; + dual_simplex::upper_triangular_transpose_solve(U_, bprime, work_estimate); permute_vector(col_permutation_, bprime, x); return 0; } @@ -649,8 +656,9 @@ i_t basis_update_t::u_transpose_solve(sparse_vector_t& rhs) #endif // U'*y = bprime - i_t top = sparse_triangle_solve( - bprime, std::nullopt, xi_workspace_, U_transpose_, x_workspace_.data()); + f_t work_estimate = 0; + i_t top = sparse_triangle_solve( + bprime, std::nullopt, xi_workspace_, U_transpose_, x_workspace_.data(), work_estimate); solve_to_sparse_vector(top, rhs); // Uses xi_workspace_ and x_workspace_ to fill rhs #ifdef CHECK_WORKSPACE @@ -871,8 +879,9 @@ i_t basis_update_t::update(std::vector& utilde, i_t leaving_index // Solve U'*w = delta*et std::vector w(m); - w[t] = delta; - dual_simplex::upper_triangular_transpose_solve(U_, w); + w[t] = delta; + f_t work_estimate = 0; + dual_simplex::upper_triangular_transpose_solve(U_, w, work_estimate); #ifdef PARANOID { // Compute the residual of the solve @@ -1111,7 +1120,9 @@ i_t basis_update_t::lower_triangular_multiply(const csc_matrix_t i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts_basic) @@ -1121,10 +1132,12 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts // Solve for U^T W^T = C_B^T // We do this one row at a time of C_B csc_matrix_t WT(m, cuts_basic.m, 0); + work_estimate_ += cuts_basic.m; i_t WT_nz = 0; for (i_t k = 0; k < cuts_basic.m; k++) { sparse_vector_t rhs(cuts_basic, k); + work_estimate_ += rhs.i.size(); u_transpose_solve(rhs); WT.col_start[k] = WT_nz; for (i_t q = 0; q < rhs.i.size(); q++) { @@ -1132,6 +1145,7 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts WT.x.push_back(rhs.x[q]); WT_nz++; } + work_estimate_ += 3 * rhs.i.size(); } WT.col_start[cuts_basic.m] = WT_nz; @@ -1161,6 +1175,8 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts #endif csc_matrix_t V(cuts_basic.m, m, 0); + work_estimate_ += m; + i_t V_nz = 0; if (num_updates_ > 0) { // W = V T_0 ... T_{num_updates_ - 1} // or V = W T_{num_updates_ - 1}^{-1} ... T_0^{-1} @@ -1173,11 +1189,12 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts // for appending to L0 csr_matrix_t V_row(cuts_basic.m, m, 0); - i_t V_nz = 0; + work_estimate_ += m; const f_t zero_tol = 1e-13; for (i_t h = 0; h < cuts_basic.m; h++) { sparse_vector_t rhs(WT, h); scatter_into_workspace(rhs); + work_estimate_ += 2 * rhs.i.size(); i_t nz = rhs.i.size(); for (i_t k = num_updates_ - 1; k >= 0; --k) { // T_k^{-T} = ( I - v u^T/(1 + u^T v)) @@ -1201,10 +1218,12 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts V_row.x.push_back(rhs.x[q]); V_nz++; } + work_estimate_ += 2 * rhs.i.size(); } V_row.row_start[cuts_basic.m] = V_nz; V_row.to_compressed_col(V); + work_estimate_ += 3 * V_nz; #ifdef CHECK_V csc_matrix_t CB_col(cuts_basic.m, m, 0); @@ -1245,6 +1264,7 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts } else { // W = V WT.transpose(V); + work_estimate_ += 3 * WT.col_start[WT.n]; } // Extend u_i, v_i for i = 0, ..., num_updates_ - 1 @@ -1254,9 +1274,10 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts // L = [ L0 0 ] // [ V I ] - i_t V_nz = V.col_start[m]; + V_nz = V.col_start[m]; i_t L_nz = L0_.col_start[m]; csc_matrix_t new_L(m + cuts_basic.m, m + cuts_basic.m, L_nz + V_nz + cuts_basic.m); + work_estimate_ += (L_nz + V_nz + cuts_basic.m) + (m + cuts_basic.m); i_t predicted_nz = L_nz + V_nz + cuts_basic.m; L_nz = 0; for (i_t j = 0; j < m; ++j) { @@ -1276,16 +1297,19 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts L_nz++; } } + work_estimate_ += 4 * L_nz; for (i_t j = m; j < m + cuts_basic.m; ++j) { new_L.col_start[j] = L_nz; new_L.i[L_nz] = j; new_L.x[L_nz] = 1.0; L_nz++; } + work_estimate_ += 3 * cuts_basic.m; new_L.col_start[m + cuts_basic.m] = L_nz; assert(L_nz == predicted_nz); L0_ = new_L; + work_estimate_ += 2 * L_nz; // Adjust U // U = [ U0 0 ] @@ -1295,12 +1319,14 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts U0_.col_start.resize(m + cuts_basic.m + 1); U0_.i.resize(U_nz + cuts_basic.m); U0_.x.resize(U_nz + cuts_basic.m); + work_estimate_ += 2 * (U_nz + cuts_basic.m) + (m + cuts_basic.m); for (i_t k = m; k < m + cuts_basic.m; ++k) { U0_.col_start[k] = U_nz; U0_.i[U_nz] = k; U0_.x[U_nz] = 1.0; U_nz++; } + work_estimate_ += 3 * cuts_basic.m; U0_.col_start[m + cuts_basic.m] = U_nz; U0_.n = m + cuts_basic.m; U0_.m = m + cuts_basic.m; @@ -1310,14 +1336,17 @@ i_t basis_update_mpf_t::append_cuts(const csr_matrix_t& cuts // Adjust row_permutation_ and inverse_row_permutation_ row_permutation_.resize(m + cuts_basic.m); inverse_row_permutation_.resize(m + cuts_basic.m); + work_estimate_ += 2 * (m + cuts_basic.m); for (i_t k = m; k < m + cuts_basic.m; ++k) { row_permutation_[k] = k; } + work_estimate_ += cuts_basic.m; inverse_permutation(row_permutation_, inverse_row_permutation_); // Adjust workspace sizes xi_workspace_.resize(2 * (m + cuts_basic.m), 0); x_workspace_.resize(m + cuts_basic.m, 0.0); + work_estimate_ += 3 * (m + cuts_basic.m); return 0; } @@ -1331,6 +1360,7 @@ void basis_update_mpf_t::gather_into_sparse_vector(i_t nz, out.x.clear(); out.i.reserve(nz); out.x.reserve(nz); + work_estimate_ += 2 * nz; const f_t zero_tol = 1e-13; for (i_t k = 0; k < nz; ++k) { const i_t i = xi_workspace_[m + k]; @@ -1342,6 +1372,8 @@ void basis_update_mpf_t::gather_into_sparse_vector(i_t nz, xi_workspace_[i] = 0; x_workspace_[i] = 0.0; } + work_estimate_ += 5 * nz; + work_estimate_ += 3 * out.i.size(); } template @@ -1355,10 +1387,12 @@ void basis_update_mpf_t::solve_to_workspace(i_t top) const xi_workspace_[p] = 0; nz++; } + work_estimate_ += 3 * (m - top); for (i_t k = 0; k < nz; ++k) { const i_t i = xi_workspace_[m + k]; xi_workspace_[i] = 1; } + work_estimate_ += 2 * nz; } template @@ -1372,6 +1406,7 @@ void basis_update_mpf_t::solve_to_sparse_vector(i_t top, out.i.clear(); out.x.reserve(nz); out.i.reserve(nz); + work_estimate_ += 2 * nz; i_t k = 0; const f_t zero_tol = 1e-13; for (i_t p = top; p < m; ++p) { @@ -1384,6 +1419,7 @@ void basis_update_mpf_t::solve_to_sparse_vector(i_t top, xi_workspace_[p] = 0; k++; } + work_estimate_ += 4 * k + 3 * out.i.size(); } template @@ -1397,10 +1433,12 @@ i_t basis_update_mpf_t::scatter_into_workspace(const sparse_vector_t::grow_storage(i_t nz, i_t& S_start, i_t& S_nz) const i_t new_last_S_col = last_S_col + 2; if (new_last_S_col >= S_.col_start.size()) { S_.col_start.resize(new_last_S_col + refactor_frequency_); + work_estimate_ += new_last_S_col + refactor_frequency_; } S_nz = S_.col_start[last_S_col]; if (S_nz + nz > S_.i.size()) { S_.i.resize(std::max(2 * S_nz, S_nz + nz)); S_.x.resize(std::max(2 * S_nz, S_nz + nz)); + work_estimate_ += 2 * std::max(2 * S_nz, S_nz + nz); } S_start = last_S_col; assert(S_nz + nz <= S_.i.size()); @@ -1431,6 +1471,7 @@ i_t basis_update_mpf_t::nonzeros(const std::vector& x) const for (i_t i = 0; i < xsz; ++i) { if (x[i] != 0.0) { nz++; } } + work_estimate_ += xsz; return nz; } @@ -1445,6 +1486,7 @@ f_t basis_update_mpf_t::dot_product(i_t col, const std::vector& x const i_t i = S_.i[p]; dot += S_.x[p] * x[i]; } + work_estimate_ += 3 * (col_end - col_start); return dot; } @@ -1457,10 +1499,15 @@ f_t basis_update_mpf_t::dot_product(i_t col, f_t dot = 0.0; const i_t col_start = S_.col_start[col]; const i_t col_end = S_.col_start[col + 1]; + i_t nz_mark = 0; for (i_t p = col_start; p < col_end; ++p) { const i_t i = S_.i[p]; - if (mark[i]) { dot += S_.x[p] * x[i]; } + if (mark[i]) { + dot += S_.x[p] * x[i]; + nz_mark++; + } } + work_estimate_ += 2 * nz_mark + (col_end - col_start); return dot; } @@ -1477,6 +1524,7 @@ void basis_update_mpf_t::add_sparse_column(const csc_matrix_t @@ -1490,6 +1538,7 @@ void basis_update_mpf_t::add_sparse_column(const csc_matrix_t::add_sparse_column(const csc_matrix_t @@ -1528,14 +1578,17 @@ i_t basis_update_mpf_t::b_transpose_solve(const std::vector& rhs, // Solve for r such that U'*r = c std::vector r = rhs; + work_estimate_ += 2 * r.size(); u_transpose_solve(r); UTsol = r; + work_estimate_ += 2 * r.size(); // Solve for w such that L'*w = r l_transpose_solve(r); // Compute y = P'*w inverse_permute_vector(row_permutation_, r, solution); + work_estimate_ += 3 * r.size(); return 0; } @@ -1630,7 +1683,7 @@ template i_t basis_update_mpf_t::u_transpose_solve(std::vector& rhs) const { total_dense_U_transpose_++; - dual_simplex::upper_triangular_transpose_solve(U0_, rhs); + dual_simplex::upper_triangular_transpose_solve(U0_, rhs, work_estimate_); return 0; } @@ -1641,7 +1694,7 @@ i_t basis_update_mpf_t::u_transpose_solve(sparse_vector_t& r // U0'*x = y // Solve U0'*x0 = y i_t top = dual_simplex::sparse_triangle_solve( - rhs, std::nullopt, xi_workspace_, U0_transpose_, x_workspace_.data()); + rhs, std::nullopt, xi_workspace_, U0_transpose_, x_workspace_.data(), work_estimate_); solve_to_sparse_vector(top, rhs); return 0; } @@ -1671,9 +1724,10 @@ i_t basis_update_mpf_t::l_transpose_solve(std::vector& rhs) const if (std::abs(theta) > zero_tol) { add_sparse_column(S_, v_col, -theta, rhs); } } + work_estimate_ += 2 * num_updates_; // Solve for x such that L0^T * x = b' - dual_simplex::lower_triangular_transpose_solve(L0_, rhs); + dual_simplex::lower_triangular_transpose_solve(L0_, rhs, work_estimate_); return 0; } @@ -1729,6 +1783,7 @@ i_t basis_update_mpf_t::l_transpose_solve(sparse_vector_t& r } #endif } + work_estimate_ += 2 * num_updates_; #ifdef CHECK_MULTIPLY for (i_t i = 0; i < m; ++i) { @@ -1743,9 +1798,10 @@ i_t basis_update_mpf_t::l_transpose_solve(sparse_vector_t& r #endif sparse_vector_t b(m, nz); + work_estimate_ += nz; gather_into_sparse_vector(nz, b); i_t top = dual_simplex::sparse_triangle_solve( - b, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data()); + b, std::nullopt, xi_workspace_, L0_transpose_, x_workspace_.data(), work_estimate_); solve_to_sparse_vector(top, rhs); #ifdef CHECK_SPARSE_SOLVE @@ -1772,6 +1828,7 @@ i_t basis_update_mpf_t::b_solve(const std::vector& rhs, { const i_t m = L0_.m; std::vector Lsol(m); + work_estimate_ += m; return b_solve(rhs, solution, Lsol); } @@ -1788,6 +1845,7 @@ i_t basis_update_mpf_t::b_solve(const std::vector& rhs, // P*B*x = P*b permute_vector(row_permutation_, rhs, solution); + work_estimate_ += 3 * rhs.size(); // L*U*x = b' // Solve for v such that L*v = b' @@ -1795,7 +1853,10 @@ i_t basis_update_mpf_t::b_solve(const std::vector& rhs, std::vector rhs_permuted = solution; #endif l_solve(solution); - if (need_Lsol) { Lsol = solution; } + if (need_Lsol) { + Lsol = solution; + work_estimate_ += 2 * solution.size(); + } #ifdef CHECK_L_SOLVE std::vector Lsol_check = Lsol; @@ -1836,7 +1897,9 @@ i_t basis_update_mpf_t::b_solve(const sparse_vector_t& rhs, { const i_t m = L0_.m; solution = rhs; + work_estimate_ += 2 * rhs.i.size(); solution.inverse_permute_vector(inverse_row_permutation_); + work_estimate_ += 3 * rhs.i.size(); #ifdef CHECK_PERMUTATION std::vector permuation_rhs; @@ -1866,10 +1929,15 @@ i_t basis_update_mpf_t::b_solve(const sparse_vector_t& rhs, } else { std::vector solution_dense; solution.to_dense(solution_dense); + work_estimate_ += solution_dense.size(); l_solve(solution_dense); solution.from_dense(solution_dense); + work_estimate_ += solution_dense.size(); + } + if (need_Lsol) { + Lsol = solution; + work_estimate_ += 2 * solution.i.size(); } - if (need_Lsol) { Lsol = solution; } sum_L_ += static_cast(solution.i.size()) / input_size; #ifdef CHECK_L_SOLVE @@ -1897,8 +1965,10 @@ i_t basis_update_mpf_t::b_solve(const sparse_vector_t& rhs, } else { std::vector solution_dense; solution.to_dense(solution_dense); + work_estimate_ += solution_dense.size(); u_solve(solution_dense); solution.from_dense(solution_dense); + work_estimate_ += solution_dense.size(); } sum_U_ += static_cast(solution.i.size()) / rhs_size; @@ -1921,7 +1991,7 @@ i_t basis_update_mpf_t::u_solve(std::vector& rhs) const total_dense_U_++; const i_t m = L0_.m; // U*x = y - dual_simplex::upper_triangular_solve(U0_, rhs); + dual_simplex::upper_triangular_solve(U0_, rhs, work_estimate_); return 0; } @@ -1934,7 +2004,7 @@ i_t basis_update_mpf_t::u_solve(sparse_vector_t& rhs) const // Solve U0*x = y i_t top = dual_simplex::sparse_triangle_solve( - rhs, std::nullopt, xi_workspace_, U0_, x_workspace_.data()); + rhs, std::nullopt, xi_workspace_, U0_, x_workspace_.data(), work_estimate_); solve_to_sparse_vector(top, rhs); return 0; @@ -1955,7 +2025,7 @@ i_t basis_update_mpf_t::l_solve(std::vector& rhs) const #ifdef CHECK_L_SOLVE std::vector rhs_check = rhs; #endif - dual_simplex::lower_triangular_solve(L0_, rhs); + dual_simplex::lower_triangular_solve(L0_, rhs, work_estimate_); #ifdef CHECK_L0_SOLVE matrix_vector_multiply(L0_, 1.0, rhs, -1.0, residual); @@ -1979,6 +2049,7 @@ i_t basis_update_mpf_t::l_solve(std::vector& rhs) const if (std::abs(theta) > zero_tol) { add_sparse_column(S_, u_col, -theta, rhs); } } + work_estimate_ += 2 * num_updates_; #ifdef CHECK_L_SOLVE std::vector inout = rhs; @@ -2004,7 +2075,7 @@ i_t basis_update_mpf_t::l_solve(sparse_vector_t& rhs) const // First solve L0*x0 = y i_t top = dual_simplex::sparse_triangle_solve( - rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data()); + rhs, std::nullopt, xi_workspace_, L0_, x_workspace_.data(), work_estimate_); solve_to_workspace(top); // Uses xi_workspace_ and x_workspace_ to fill rhs i_t nz = m - top; // Then T0 * T1 * ... * T_{num_updates_ - 1} * x = x0 @@ -2027,6 +2098,7 @@ i_t basis_update_mpf_t::l_solve(sparse_vector_t& rhs) const add_sparse_column(S_, u_col, -theta, xi_workspace_, nz, x_workspace_); } } + work_estimate_ += 2 * num_updates_; gather_into_sparse_vector(nz, rhs); @@ -2049,6 +2121,7 @@ i_t basis_update_mpf_t::update(const std::vector& utilde, const i_t col_start = U0_.col_start[leaving_index]; const i_t col_end = U0_.col_start[leaving_index + 1]; std::vector u = utilde; + work_estimate_ += 2 * utilde.size(); // u = utilde - U0(:, leaving_index) add_sparse_column(U0_, leaving_index, -1.0, u); @@ -2069,9 +2142,11 @@ i_t basis_update_mpf_t::update(const std::vector& utilde, // Scatter u into S S_.append_column(u); + work_estimate_ += u.size() + 3 * u_nz; // Scatter v into S S_.append_column(etilde); + work_estimate_ += etilde.size() + 3 * v_nz; // Compute mu = 1 + v^T * u const f_t mu = 1.0 + sparse_dot(S_.i.data() + S_.col_start[S_start], @@ -2080,6 +2155,7 @@ i_t basis_update_mpf_t::update(const std::vector& utilde, S_.i.data() + S_.col_start[S_start + 1], S_.x.data() + S_.col_start[S_start + 1], v_nz); + work_estimate_ += 3 * std::min(u_nz, v_nz); if (std::abs(mu) < 1E-8 || std::abs(mu) > 1E+8) { // Force a refactor. Otherwise we will get numerical issues when dividing by mu. @@ -2126,6 +2202,7 @@ i_t basis_update_mpf_t::update(const sparse_vector_t& utilde // Ensure the workspace is sorted. Otherwise, the sparse dot will be incorrect. std::sort(xi_workspace_.begin() + m, xi_workspace_.begin() + m + nz, std::less()); + work_estimate_ += (m + nz) * std::log2(m + nz); // Gather the workspace into a column of S i_t S_start; @@ -2133,10 +2210,13 @@ i_t basis_update_mpf_t::update(const sparse_vector_t& utilde grow_storage(nz + etilde.i.size(), S_start, S_nz); S_.append_column(nz, xi_workspace_.data() + m, x_workspace_.data()); + work_estimate_ += 5 * nz; // Gather etilde into a column of S etilde.sort(); // Needs to be sorted for the sparse dot. TODO(CMM): Is etilde sorted on input? + work_estimate_ += etilde.i.size() * std::log2(etilde.i.size()); S_.append_column(etilde); + work_estimate_ += 4 * etilde.i.size(); // Compute mu = 1 + v^T * u const f_t mu = 1.0 + sparse_dot(S_.i.data() + S_.col_start[S_start], @@ -2145,6 +2225,7 @@ i_t basis_update_mpf_t::update(const sparse_vector_t& utilde S_.i.data() + S_.col_start[S_start + 1], S_.x.data() + S_.col_start[S_start + 1], S_.col_start[S_start + 2] - S_.col_start[S_start + 1]); + work_estimate_ += 3 * std::min(nz, static_cast(etilde.i.size())); if (std::abs(mu) < 1E-8 || std::abs(mu) > 1E+8) { // Force a refactor. Otherwise we will get numerical issues when dividing by mu. @@ -2158,6 +2239,7 @@ i_t basis_update_mpf_t::update(const sparse_vector_t& utilde x_workspace_[i] = 0.0; xi_workspace_[m + k] = 0; } + work_estimate_ += 4 * nz; #ifdef PRINT_MU_INFO printf("Update mu %e u nz %d v nz %d\n", @@ -2190,7 +2272,6 @@ void basis_update_mpf_t::l_multiply(std::vector& inout) const const f_t theta = dot; add_sparse_column(S_, u_col, theta, inout); } - std::vector out(m, 0.0); matrix_vector_multiply(L0_, 1.0, inout, 0.0, out); inout = out; @@ -2271,7 +2352,10 @@ int basis_update_mpf_t::refactor_basis( std::vector slacks_needed; std::vector superbasic_list; // Empty superbasic list - if (L0_.m != A.m) { resize(A.m); } + if (L0_.m != A.m) { + resize(A.m); + work_estimate_ += A.m; + } std::vector q; i_t status = factorize_basis(A, settings, @@ -2282,7 +2366,8 @@ int basis_update_mpf_t::refactor_basis( inverse_row_permutation_, q, deficient, - slacks_needed); + slacks_needed, + work_estimate_); if (status == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; } if (status == -1) { settings.log.debug("Initial factorization failed\n"); @@ -2295,7 +2380,8 @@ int basis_update_mpf_t::refactor_basis( basic_list, nonbasic_list, superbasic_list, - vstatus); + vstatus, + work_estimate_); #ifdef CHECK_BASIS_REPAIR const i_t m = A.m; @@ -2323,7 +2409,8 @@ int basis_update_mpf_t::refactor_basis( inverse_row_permutation_, q, deficient, - slacks_needed); + slacks_needed, + work_estimate_); if (status == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; } if (status == -1) { #ifdef CHECK_L_FACTOR @@ -2338,6 +2425,7 @@ int basis_update_mpf_t::refactor_basis( assert(q.size() == A.m); reorder_basic_list(q, basic_list); // We no longer need q after reordering the basic list + work_estimate_ += 3 * q.size(); reset(); return 0; } diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp index d9783053f7..bb2b42d8ae 100644 --- a/cpp/src/dual_simplex/basis_updates.hpp +++ b/cpp/src/dual_simplex/basis_updates.hpp @@ -265,6 +265,7 @@ class basis_update_mpf_t { assert(p.size() == Linit.m); row_permutation_ = p; inverse_permutation(row_permutation_, inverse_row_permutation_); + work_estimate_ += 4 * p.size(); clear(); compute_transposes(); reset_stats(); @@ -371,6 +372,7 @@ class basis_update_mpf_t { { L0_.transpose(L0_transpose_); U0_.transpose(U0_transpose_); + work_estimate_ += 6 * L0_.col_start[L0_.n] + 6 * U0_.col_start[U0_.n]; } void multiply_lu(csc_matrix_t& out) const; @@ -386,6 +388,9 @@ class basis_update_mpf_t { void set_refactor_frequency(i_t new_frequency) { refactor_frequency_ = new_frequency; } + f_t work_estimate() const { return work_estimate_; } + void clear_work_estimate() { work_estimate_ = 0.0; } + private: void clear() { @@ -400,9 +405,11 @@ class basis_update_mpf_t { mu_values_.clear(); mu_values_.reserve(refactor_frequency_); num_updates_ = 0; + work_estimate_ += 2 * refactor_frequency_; std::fill(xi_workspace_.begin(), xi_workspace_.end(), 0); std::fill(x_workspace_.begin(), x_workspace_.end(), 0.0); + work_estimate_ += xi_workspace_.size() + x_workspace_.size(); } void grow_storage(i_t nz, i_t& S_start, i_t& S_nz); @@ -472,6 +479,8 @@ class basis_update_mpf_t { mutable f_t sum_U_transpose_; f_t hypersparse_threshold_; + + mutable f_t work_estimate_{0.0}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp index fac65b8140..e30b067398 100644 --- a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp +++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp @@ -26,9 +26,9 @@ i_t bound_flipping_ratio_test_t::compute_breakpoints(std::vector& i_t idx = 0; while (idx == 0 && pivot_tol >= 1e-12) { - // for (i_t k = 0; k < n - m; ++k) { - // const i_t j = nonbasic_list_[k]; - for (i_t h = 0; h < delta_z_indices_.size(); ++h) { + // Loop over the nonbasic variables j with non-zero delta_z + const i_t nz = delta_z_indices_.size(); + for (i_t h = 0; h < nz; ++h) { const i_t j = delta_z_indices_[h]; const i_t k = nonbasic_mark_[j]; if (vstatus_[j] == variable_status_t::NONBASIC_FIXED) { continue; } @@ -45,6 +45,8 @@ i_t bound_flipping_ratio_test_t::compute_breakpoints(std::vector& idx++; } } + work_estimate_ += 4 * nz; + work_estimate_ += 4 * idx; pivot_tol /= 10; } return idx; @@ -66,11 +68,15 @@ i_t bound_flipping_ratio_test_t::single_pass(i_t start, i_t candidate = -1; f_t zero_tol = settings_.zero_tol; i_t k_idx = -1; + + i_t min_found = 0; + i_t harris_found = 0; for (i_t k = start; k < end; ++k) { if (ratios[k] < min_val) { min_val = ratios[k]; candidate = indicies[k]; k_idx = k; + min_found++; } else if (ratios[k] < min_val + zero_tol) { // Use Harris to select variables with larger pivots const i_t j = nonbasic_list_[indicies[k]]; @@ -79,8 +85,11 @@ i_t bound_flipping_ratio_test_t::single_pass(i_t start, candidate = indicies[k]; k_idx = k; } + harris_found++; } } + work_estimate_ += (end - start) + 2 * min_found + 6 * harris_found; + step_length = min_val; nonbasic_entering = candidate; // this should be temporary, find root causes where the candidate is not filled @@ -116,6 +125,7 @@ i_t bound_flipping_ratio_test_t::compute_step_length(f_t& step_length, // Compute the initial set of breakpoints std::vector indicies(nz); std::vector ratios(nz); + work_estimate_ += 2 * nz; i_t num_breakpoints = compute_breakpoints(indicies, ratios); if constexpr (verbose) { settings_.log.printf("Initial breakpoints %d\n", num_breakpoints); } if (num_breakpoints == 0) { @@ -132,7 +142,7 @@ i_t bound_flipping_ratio_test_t::compute_step_length(f_t& step_length, if (k_idx == RATIO_TEST_NUMERICAL_ISSUES) { return RATIO_TEST_NUMERICAL_ISSUES; } bool continue_search = k_idx >= 0 && num_breakpoints > 1 && slope > 0.0; if (!continue_search) { - if constexpr (0) { + if constexpr (verbose) { settings_.log.printf( "BFRT stopping. No bound flips. Step length %e Nonbasic entering %d Entering %d pivot %e\n", step_length, @@ -163,6 +173,7 @@ i_t bound_flipping_ratio_test_t::compute_step_length(f_t& step_length, for (i_t k = 0; k < num_breakpoints - 1; ++k) { if (ratios[k] > max_ratio) { max_ratio = ratios[k]; } } + work_estimate_ += 2 * num_breakpoints; settings_.log.printf( "Starting heap passes. %d breakpoints max ratio %e\n", num_breakpoints - 1, max_ratio); bucket_pass( @@ -207,6 +218,7 @@ void bound_flipping_ratio_test_t::heap_passes(const std::vector& std::abs(delta_z[nonbasic_list[current_indicies[k]]])); } } + work_estimate_ += N; auto compare = [zero_tol, ¤t_ratios, ¤t_indicies, &delta_z, &nonbasic_list]( const i_t& a, const i_t& b) { @@ -217,11 +229,13 @@ void bound_flipping_ratio_test_t::heap_passes(const std::vector& }; std::make_heap(bare_idx.begin(), bare_idx.end(), compare); + work_estimate_ += 3 * bare_idx.size(); while (bare_idx.size() > 0 && slope > 0) { // Remove minimum ratio from the heap and rebalance i_t heap_index = bare_idx.front(); std::pop_heap(bare_idx.begin(), bare_idx.end(), compare); + work_estimate_ += 2 * std::log2(bare_idx.size()); bare_idx.pop_back(); nonbasic_entering = current_indicies[heap_index]; diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp index 51b00b1097..244ff334df 100644 --- a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp +++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp @@ -54,6 +54,7 @@ class bound_flipping_ratio_test_t { } i_t compute_step_length(f_t& step_length, i_t& nonbasic_entering); + f_t work_estimate() const { return work_estimate_; } private: i_t compute_breakpoints(std::vector& indices, std::vector& ratios); @@ -98,6 +99,8 @@ class bound_flipping_ratio_test_t { i_t n_; i_t m_; + + f_t work_estimate_; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/bounds_strengthening.cpp b/cpp/src/dual_simplex/bounds_strengthening.cpp index 37ab114a7e..6506ce4873 100644 --- a/cpp/src/dual_simplex/bounds_strengthening.cpp +++ b/cpp/src/dual_simplex/bounds_strengthening.cpp @@ -102,11 +102,6 @@ bool bounds_strengthening_t::bounds_strengthening( std::vector variable_changed(n, false); std::vector constraint_changed_next(m, false); - auto& A_i = A.i.underlying(); - auto& A_x = A.x.underlying(); - auto& Arow_j = Arow.j.underlying(); - auto& Arow_x = Arow.x.underlying(); - size_t nnz_processed = 0; if (!bounds_changed.empty()) { @@ -116,7 +111,7 @@ bool bounds_strengthening_t::bounds_strengthening( const i_t col_start = A.col_start[j]; const i_t col_end = A.col_start[j + 1]; for (i_t p = col_start; p < col_end; ++p) { - const i_t i = A_i[p]; + const i_t i = A.i[p]; constraint_changed[i] = true; } } @@ -139,8 +134,8 @@ bool bounds_strengthening_t::bounds_strengthening( f_t min_a = 0.0; f_t max_a = 0.0; for (i_t p = row_start; p < row_end; ++p) { - const i_t j = Arow_j[p]; - const f_t a_ij = Arow_x[p]; + const i_t j = Arow.j[p]; + const f_t a_ij = Arow.x[p]; variable_changed[j] = true; if (a_ij > 0) { @@ -192,10 +187,10 @@ bool bounds_strengthening_t::bounds_strengthening( const i_t col_end = A.col_start[k + 1]; nnz_processed += (col_end - col_start); for (i_t p = col_start; p < col_end; ++p) { - const i_t i = A_i[p]; + const i_t i = A.i[p]; if (!constraint_changed[i]) { continue; } - const f_t a_ik = A_x[p]; + const f_t a_ik = A.x[p]; f_t delta_min_act = delta_min_activity[i]; f_t delta_max_act = delta_max_activity[i]; diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp index 597628e735..aa2baa6ea2 100644 --- a/cpp/src/dual_simplex/crossover.cpp +++ b/cpp/src/dual_simplex/crossover.cpp @@ -505,8 +505,9 @@ i_t dual_push(const lp_problem_t& lp, std::vector q(m); std::vector deficient; std::vector slacks_needed; - i_t rank = - factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + f_t work_estimate = 0; + i_t rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; } else if (rank != m) { @@ -520,9 +521,10 @@ i_t dual_push(const lp_problem_t& lp, basic_list, nonbasic_list, superbasic_list, - vstatus); - rank = - factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + vstatus, + work_estimate); + rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; } else if (rank == -1) { @@ -860,8 +862,9 @@ i_t primal_push(const lp_problem_t& lp, std::vector q(m); std::vector deficient; std::vector slacks_needed; - i_t rank = - factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + f_t work_estimate = 0; + i_t rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; } else if (rank != m) { @@ -875,12 +878,13 @@ i_t primal_push(const lp_problem_t& lp, basic_list, nonbasic_list, superbasic_list, - vstatus); + vstatus, + work_estimate); // We need to be careful. As basis_repair may have changed the superbasic list find_primal_superbasic_variables( lp, settings, solution, solution, vstatus, nonbasic_list, superbasic_list); - rank = - factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return CONCURRENT_HALT_RETURN; } else if (rank == -1) { @@ -1142,6 +1146,7 @@ crossover_status_t crossover(const lp_problem_t& lp, const i_t m = lp.num_rows; const i_t n = lp.num_cols; f_t crossover_start = tic(); + f_t work_estimate = 0; settings.log.printf("\n"); settings.log.printf("Starting crossover\n"); @@ -1223,7 +1228,8 @@ crossover_status_t crossover(const lp_problem_t& lp, std::vector deficient; std::vector slacks_needed; - rank = factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return crossover_status_t::CONCURRENT_LIMIT; } if (rank != m) { settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m); @@ -1236,8 +1242,10 @@ crossover_status_t crossover(const lp_problem_t& lp, basic_list, nonbasic_list, superbasic_list, - vstatus); - rank = factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + vstatus, + work_estimate); + rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return crossover_status_t::CONCURRENT_LIMIT; } else if (rank == -1) { @@ -1392,8 +1400,8 @@ crossover_status_t crossover(const lp_problem_t& lp, nonbasic_list.clear(); superbasic_list.clear(); get_basis_from_vstatus(m, vstatus, basic_list, nonbasic_list, superbasic_list); - rank = - factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return crossover_status_t::CONCURRENT_LIMIT; } else if (rank != m) { @@ -1407,9 +1415,10 @@ crossover_status_t crossover(const lp_problem_t& lp, basic_list, nonbasic_list, superbasic_list, - vstatus); - rank = - factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + vstatus, + work_estimate); + rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return crossover_status_t::CONCURRENT_LIMIT; } else if (rank == -1) { diff --git a/cpp/src/dual_simplex/folding.cpp b/cpp/src/dual_simplex/folding.cpp index 5e42afb01f..f851e51d79 100644 --- a/cpp/src/dual_simplex/folding.cpp +++ b/cpp/src/dual_simplex/folding.cpp @@ -128,8 +128,8 @@ void compute_sums(const csc_matrix_t& A, // Find all vertices (columns) that have a neighbor in the refining color colors_to_update.reserve(num_col_colors); find_vertices_to_refine(refining_color.vertices, - Arow.row_start.underlying(), - Arow.j.underlying(), + Arow.row_start, + Arow.j, col_color_map, marked_vertices, vertices_to_refine, @@ -145,9 +145,9 @@ void compute_sums(const csc_matrix_t& A, compute_sums_of_refined_vertices(refining_color.color, refining_color.vertices, vertices_to_refine, - Arow.row_start.underlying(), - Arow.j.underlying(), - Arow.x.underlying(), + Arow.row_start, + Arow.j, + Arow.x, col_color_map, vertex_to_sum, max_sum_by_color); @@ -156,8 +156,8 @@ void compute_sums(const csc_matrix_t& A, // Find all vertices (rows) that have a neighbor in the refining color colors_to_update.reserve(num_row_colors); find_vertices_to_refine(refining_color.vertices, - A.col_start.underlying(), - A.i.underlying(), + A.col_start, + A.i, row_color_map, marked_vertices, vertices_to_refine, @@ -173,9 +173,9 @@ void compute_sums(const csc_matrix_t& A, compute_sums_of_refined_vertices(refining_color.color, refining_color.vertices, vertices_to_refine, - A.col_start.underlying(), - A.i.underlying(), - A.x.underlying(), + A.col_start, + A.i, + A.x, row_color_map, vertex_to_sum, max_sum_by_color); diff --git a/cpp/src/dual_simplex/initial_basis.cpp b/cpp/src/dual_simplex/initial_basis.cpp index 5da8449040..9343769d00 100644 --- a/cpp/src/dual_simplex/initial_basis.cpp +++ b/cpp/src/dual_simplex/initial_basis.cpp @@ -79,7 +79,8 @@ i_t initial_basis_selection(const lp_problem_t& problem, i_t row_singletons; i_t col_singletons; std::vector row_perm(N); - find_singletons(C, row_singletons, row_perm, col_singletons, q); + f_t work_estimate = 0; + find_singletons(C, row_singletons, row_perm, col_singletons, q, work_estimate); std::vector row_perm_inv(N); inverse_permutation(row_perm, row_perm_inv); diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 1e057a16b8..50af6971b9 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -326,55 +326,47 @@ template void compute_reduced_cost_update(const lp_problem_t& lp, const std::vector& basic_list, const std::vector& nonbasic_list, - const ins_vector& delta_y, + const std::vector& delta_y, i_t leaving_index, i_t direction, - ins_vector& delta_z_mark, - ins_vector& delta_z_indices, - ins_vector& delta_z) + std::vector& delta_z_mark, + std::vector& delta_z_indices, + std::vector& delta_z, + f_t& work_estimate) { const i_t m = lp.num_rows; const i_t n = lp.num_cols; - const f_t* __restrict__ delta_y_ptr = delta_y.data(); - const f_t* __restrict__ Ax = lp.A.x.data(); - const i_t* __restrict__ Ai = lp.A.i.data(); - const i_t* __restrict__ ptr_col_start = lp.A.col_start.data(); - f_t* __restrict__ delta_z_ptr = delta_z.data(); - size_t nnzs_processed = 0; - // delta_zB = sigma*ei for (i_t k = 0; k < m; k++) { - const i_t j = basic_list[k]; - delta_z_ptr[j] = 0; + const i_t j = basic_list[k]; + delta_z[j] = 0; } - delta_z_ptr[leaving_index] = direction; + work_estimate += 2 * m; + delta_z[leaving_index] = direction; // delta_zN = -N'*delta_y const i_t num_nonbasic = n - m; for (i_t k = 0; k < num_nonbasic; k++) { const i_t j = nonbasic_list[k]; // z_j <- -A(:, j)'*delta_y - const i_t col_start = ptr_col_start[j]; - const i_t col_end = ptr_col_start[j + 1]; + const i_t col_start = lp.A.col_start[j]; + const i_t col_end = lp.A.col_start[j + 1]; f_t dot = 0.0; for (i_t p = col_start; p < col_end; ++p) { - dot += Ax[p] * delta_y_ptr[Ai[p]]; + dot += lp.A.x[p] * delta_y[lp.A.i[p]]; } nnzs_processed += col_end - col_start; - delta_z_ptr[j] = -dot; + delta_z[j] = -dot; if (dot != 0.0) { delta_z_indices.push_back(j); // Note delta_z_indices has n elements reserved delta_z_mark[j] = 1; } } - - lp.A.x.byte_loads += nnzs_processed * sizeof(f_t); - lp.A.i.byte_loads += nnzs_processed * sizeof(i_t); - delta_y.byte_loads += nnzs_processed * sizeof(f_t); - lp.A.col_start.byte_loads += 2 * num_nonbasic * sizeof(i_t); - delta_z.byte_stores += (m + 1 + num_nonbasic) * sizeof(f_t); + work_estimate += 3 * num_nonbasic; + work_estimate += 3 * nnzs_processed; + work_estimate += 2 * delta_z_indices.size(); } template @@ -382,67 +374,42 @@ void compute_delta_z(const csc_matrix_t& A_transpose, const sparse_vector_t& delta_y, i_t leaving_index, i_t direction, - ins_vector& nonbasic_mark, - ins_vector& delta_z_mark, - ins_vector& delta_z_indices, - ins_vector& delta_z) + std::vector& nonbasic_mark, + std::vector& delta_z_mark, + std::vector& delta_z_indices, + std::vector& delta_z, + f_t& work_estimate) { - auto& At_cs = A_transpose.col_start.underlying(); - auto& At_i = A_transpose.i.underlying(); - auto& At_x = A_transpose.x.underlying(); - auto& dy_i = delta_y.i.underlying(); - auto& dy_x = delta_y.x.underlying(); - auto& nb_mk = nonbasic_mark.underlying(); - auto& dz_mk = delta_z_mark.underlying(); - auto& dz_idx = delta_z_indices.underlying(); - auto& dz = delta_z.underlying(); - // delta_zN = - N'*delta_y - const i_t nz_delta_y = dy_i.size(); - size_t nnz_processed = 0; - size_t nb_reads = 0; - size_t dz_rmws = 0; - size_t dz_mk_reads = 0; - size_t dz_mk_writes = 0; - size_t dz_idx_writes = 0; + const i_t nz_delta_y = delta_y.i.size(); + size_t nnz_processed = 0; + size_t nonbasic_marked = 0; for (i_t k = 0; k < nz_delta_y; k++) { - const i_t i = dy_i[k]; - const f_t delta_y_i = dy_x[k]; + const i_t i = delta_y.i[k]; + const f_t delta_y_i = delta_y.x[k]; if (std::abs(delta_y_i) < 1e-12) { continue; } - const i_t row_start = At_cs[i]; - const i_t row_end = At_cs[i + 1]; + const i_t row_start = A_transpose.col_start[i]; + const i_t row_end = A_transpose.col_start[i + 1]; nnz_processed += row_end - row_start; for (i_t p = row_start; p < row_end; ++p) { - const i_t j = At_i[p]; - nb_reads++; - if (nb_mk[j] >= 0) { - dz[j] -= delta_y_i * At_x[p]; - dz_rmws++; - dz_mk_reads++; - if (!dz_mk[j]) { - dz_mk[j] = 1; - dz_idx.push_back(j); - dz_mk_writes++; - dz_idx_writes++; + const i_t j = A_transpose.i[p]; + if (nonbasic_mark[j] >= 0) { + delta_z[j] -= delta_y_i * A_transpose.x[p]; + nonbasic_marked++; + if (!delta_z_mark[j]) { + delta_z_mark[j] = 1; + delta_z_indices.push_back(j); } } } } + work_estimate += 4 * nz_delta_y; + work_estimate += 2 * nnz_processed; + work_estimate += 3 * nonbasic_marked; + work_estimate += 2 * delta_z_indices.size(); // delta_zB = sigma*ei - dz[leaving_index] = direction; - - delta_y.i.byte_loads += nz_delta_y * sizeof(i_t); - delta_y.x.byte_loads += nz_delta_y * sizeof(f_t); - A_transpose.col_start.byte_loads += 2 * nz_delta_y * sizeof(i_t); - A_transpose.i.byte_loads += nnz_processed * sizeof(i_t); - A_transpose.x.byte_loads += dz_rmws * sizeof(f_t); - nonbasic_mark.byte_loads += nb_reads * sizeof(i_t); - delta_z.byte_loads += dz_rmws * sizeof(f_t); - delta_z.byte_stores += (dz_rmws + 1) * sizeof(f_t); - delta_z_mark.byte_loads += dz_mk_reads * sizeof(i_t); - delta_z_mark.byte_stores += dz_mk_writes * sizeof(i_t); - delta_z_indices.byte_stores += dz_idx_writes * sizeof(i_t); + delta_z[leaving_index] = direction; #ifdef CHECK_CHANGE_IN_REDUCED_COST delta_y_sparse.to_dense(delta_y); @@ -480,7 +447,8 @@ void compute_reduced_costs(const std::vector& objective, const std::vector& y, const std::vector& basic_list, const std::vector& nonbasic_list, - std::vector& z) + std::vector& z, + f_t& work_estimate) { PHASE2_NVTX_RANGE("DualSimplex::compute_reduced_costs"); @@ -499,12 +467,15 @@ void compute_reduced_costs(const std::vector& objective, for (i_t p = col_start; p < col_end; ++p) { dot += A.x[p] * y[A.i[p]]; } + work_estimate += 3 * (col_end - col_start); z[j] -= dot; } + work_estimate += 5 * (n - m); // zB = 0 for (i_t k = 0; k < m; ++k) { z[basic_list[k]] = 0.0; } + work_estimate += 2 * m; } template @@ -515,12 +486,14 @@ void compute_primal_variables(const basis_update_mpf_t& ft, const std::vector& nonbasic_list, f_t tight_tol, std::vector& x, - ins_vector& xB_workspace) + std::vector& xB_workspace, + f_t& work_estimate) { PHASE2_NVTX_RANGE("DualSimplex::compute_primal_variables"); const i_t m = A.m; const i_t n = A.n; std::vector rhs = lp_rhs; + work_estimate += 2 * m; // rhs = b - sum_{j : x_j = l_j} A(:, j) * l(j) // - sum_{j : x_j = u_j} A(:, j) * u(j) for (i_t k = 0; k < n - m; ++k) { @@ -532,25 +505,31 @@ void compute_primal_variables(const basis_update_mpf_t& ft, for (i_t p = col_start; p < col_end; ++p) { rhs[A.i[p]] -= xj * A.x[p]; } + work_estimate += 3 * (col_end - col_start); } + work_estimate += 5 * (n - m); xB_workspace.resize(m); + work_estimate += m; ft.b_solve(rhs, xB_workspace); for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; x[j] = xB_workspace[k]; } + work_estimate += 2 * m; } +// Work is 3*delta_z_indices.size() template void clear_delta_z(i_t entering_index, i_t leaving_index, - ins_vector& delta_z_mark, - ins_vector& delta_z_indices, - ins_vector& delta_z) + std::vector& delta_z_mark, + std::vector& delta_z_indices, + std::vector& delta_z) { - for (i_t k = 0; k < delta_z_indices.size(); k++) { + const i_t nz = delta_z_indices.size(); + for (i_t k = 0; k < nz; k++) { const i_t j = delta_z_indices[k]; delta_z[j] = 0.0; delta_z_mark[j] = 0; @@ -564,13 +543,15 @@ template void clear_delta_x(const std::vector& basic_list, i_t entering_index, sparse_vector_t& scaled_delta_xB_sparse, - ins_vector& delta_x) + std::vector& delta_x, + f_t& work_estimate) { const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size(); for (i_t k = 0; k < scaled_delta_xB_nz; ++k) { const i_t j = basic_list[scaled_delta_xB_sparse.i[k]]; delta_x[j] = 0.0; } + work_estimate += 3 * scaled_delta_xB_nz; // Leaving index already included above delta_x[entering_index] = 0.0; scaled_delta_xB_sparse.i.clear(); @@ -631,7 +612,7 @@ void vstatus_changes(const std::vector& vstatus, template void compute_bounded_info(const std::vector& lower, const std::vector& upper, - ins_vector& bounded_variables) + std::vector& bounded_variables) { const size_t n = lower.size(); for (size_t j = 0; j < n; j++) { @@ -646,17 +627,20 @@ void compute_dual_solution_from_basis(const lp_problem_t& lp, const std::vector& basic_list, const std::vector& nonbasic_list, std::vector& y, - std::vector& z) + std::vector& z, + f_t& work_estimate) { const i_t m = lp.num_rows; const i_t n = lp.num_cols; y.resize(m); - ins_vector cB(m); + std::vector cB(m); + work_estimate += 2 * m; for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; cB[k] = lp.objective[j]; } + work_estimate += 3 * m; ft.b_transpose_solve(cB, y); // We want A'y + z = c @@ -664,6 +648,7 @@ void compute_dual_solution_from_basis(const lp_problem_t& lp, // B' y = c_B, z_B = 0 // N' y + z_N = c_N z.resize(n); + work_estimate += n; // zN = cN - N'*y for (i_t k = 0; k < n - m; k++) { const i_t j = nonbasic_list[k]; @@ -677,12 +662,15 @@ void compute_dual_solution_from_basis(const lp_problem_t& lp, for (i_t p = col_start; p < col_end; ++p) { dot += lp.A.x[p] * y[lp.A.i[p]]; } + work_estimate += 3 * (col_end - col_start); z[j] -= dot; } + work_estimate += 5 * (n - m); // zB = 0 for (i_t k = 0; k < m; ++k) { z[basic_list[k]] = 0.0; } + work_estimate += 2 * m; } template @@ -692,11 +680,13 @@ i_t compute_primal_solution_from_basis(const lp_problem_t& lp, const std::vector& nonbasic_list, const std::vector& vstatus, std::vector& x, - ins_vector& xB_workspace) + std::vector& xB_workspace, + f_t& work_estimate) { const i_t m = lp.num_rows; const i_t n = lp.num_cols; std::vector rhs = lp.rhs; + work_estimate += 2 * m; for (i_t k = 0; k < n - m; ++k) { const i_t j = nonbasic_list[k]; @@ -709,6 +699,7 @@ i_t compute_primal_solution_from_basis(const lp_problem_t& lp, x[j] = 0.0; } } + work_estimate += 4 * (n - m); // rhs = b - sum_{j : x_j = l_j} A(:, j) l(j) - sum_{j : x_j = u_j} A(:, j) * // u(j) @@ -720,25 +711,30 @@ i_t compute_primal_solution_from_basis(const lp_problem_t& lp, for (i_t p = col_start; p < col_end; ++p) { rhs[lp.A.i[p]] -= xj * lp.A.x[p]; } + work_estimate += 3 * (col_end - col_start); } + work_estimate += 4 * (n - m); xB_workspace.resize(m); + work_estimate += m; ft.b_solve(rhs, xB_workspace); for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; x[j] = xB_workspace[k]; } + work_estimate += 3 * m; return 0; } +// Work is 4*m + 2*n template f_t compute_initial_primal_infeasibilities(const lp_problem_t& lp, const simplex_solver_settings_t& settings, const std::vector& basic_list, const std::vector& x, - ins_vector& squared_infeasibilities, - ins_vector& infeasibility_indices, + std::vector& squared_infeasibilities, + std::vector& infeasibility_indices, f_t& primal_inf) { PHASE2_NVTX_RANGE("DualSimplex::compute_initial_primal_infeasibilities"); @@ -816,7 +812,8 @@ void update_primal_infeasibilities(const lp_problem_t& lp, std::vector& basic_change_list, std::vector& squared_infeasibilities, std::vector& infeasibility_indices, - f_t& primal_inf) + f_t& primal_inf, + f_t& work_estimate) { const f_t primal_tol = settings.primal_tol; const i_t nz = basic_change_list.size(); @@ -841,45 +838,41 @@ void update_primal_infeasibilities(const lp_problem_t& lp, j, primal_inf); } + work_estimate += 8 * nz; } template -void clean_up_infeasibilities(ins_vector& squared_infeasibilities, - ins_vector& infeasibility_indices) +void clean_up_infeasibilities(std::vector& squared_infeasibilities, + std::vector& infeasibility_indices, + f_t& work_estimate) { - auto& sq_inf = squared_infeasibilities.underlying(); - auto& idx = infeasibility_indices.underlying(); - bool needs_clean_up = false; - const i_t initial_nz = idx.size(); + const i_t initial_nz = infeasibility_indices.size(); for (i_t k = 0; k < initial_nz; ++k) { - const i_t j = idx[k]; - const f_t squared_infeas = sq_inf[j]; + const i_t j = infeasibility_indices[k]; + const f_t squared_infeas = squared_infeasibilities[j]; if (squared_infeas == 0.0) { needs_clean_up = true; } } - - size_t idx_accesses = initial_nz; - size_t sq_accesses = initial_nz; + work_estimate += 2 * initial_nz; if (needs_clean_up) { - for (size_t k = 0; k < idx.size(); ++k) { - const i_t j = idx[k]; - const f_t squared_infeas = sq_inf[j]; - idx_accesses++; - sq_accesses++; + i_t num_cleans = 0; + work_estimate += 2 * infeasibility_indices.size(); + for (size_t k = 0; k < infeasibility_indices.size(); ++k) { + const i_t j = infeasibility_indices[k]; + const f_t squared_infeas = squared_infeasibilities[j]; if (squared_infeas == 0.0) { - const i_t new_j = idx.back(); - idx[k] = new_j; - idx.pop_back(); - idx_accesses += 2; - sq_accesses++; - if (sq_inf[new_j] == 0.0) { k--; } + const i_t new_j = infeasibility_indices.back(); + infeasibility_indices[k] = new_j; + infeasibility_indices.pop_back(); + if (squared_infeasibilities[new_j] == 0.0) { + k--; + } // Decrement k so that we process the same index again + num_cleans++; } } + work_estimate += 4 * num_cleans; } - - infeasibility_indices.byte_loads += idx_accesses * sizeof(i_t); - squared_infeasibilities.byte_loads += sq_accesses * sizeof(f_t); } template @@ -887,23 +880,21 @@ i_t steepest_edge_pricing_with_infeasibilities(const lp_problem_t& lp, const simplex_solver_settings_t& settings, const std::vector& x, const std::vector& dy_steepest_edge, - const ins_vector& basic_mark, - ins_vector& squared_infeasibilities, - ins_vector& infeasibility_indices, + const std::vector& basic_mark, + std::vector& squared_infeasibilities, + std::vector& infeasibility_indices, i_t& direction, i_t& basic_leaving, - f_t& max_val) + f_t& max_val, + f_t& work_estimate) { - auto& sq_inf = squared_infeasibilities.underlying(); - auto& idx = infeasibility_indices.underlying(); - auto& bmark = basic_mark.underlying(); - max_val = 0.0; i_t leaving_index = -1; - const i_t nz = idx.size(); + const i_t nz = infeasibility_indices.size(); + i_t max_count = 0; for (i_t k = 0; k < nz; ++k) { - const i_t j = idx[k]; - const f_t squared_infeas = sq_inf[j]; + const i_t j = infeasibility_indices[k]; + const f_t squared_infeas = squared_infeasibilities[j]; const f_t val = squared_infeas / dy_steepest_edge[j]; if (val > max_val || (val == max_val && j > leaving_index)) { max_val = val; @@ -911,14 +902,12 @@ i_t steepest_edge_pricing_with_infeasibilities(const lp_problem_t& lp, const f_t lower_infeas = lp.lower[j] - x[j]; const f_t upper_infeas = x[j] - lp.upper[j]; direction = lower_infeas >= upper_infeas ? 1 : -1; + max_count++; } } + work_estimate += 3 * nz + 3 * max_count; - basic_leaving = leaving_index >= 0 ? bmark[leaving_index] : -1; - - infeasibility_indices.byte_loads += nz * sizeof(i_t); - squared_infeasibilities.byte_loads += nz * sizeof(f_t); - if (leaving_index >= 0) { basic_mark.byte_loads += sizeof(i_t); } + basic_leaving = leaving_index >= 0 ? basic_mark[leaving_index] : -1; return leaving_index; } @@ -1042,7 +1031,7 @@ f_t first_stage_harris(const lp_problem_t& lp, const std::vector& vstatus, const std::vector& nonbasic_list, std::vector& z, - ins_vector& delta_z) + std::vector& delta_z) { const i_t n = lp.num_cols; const i_t m = lp.num_rows; @@ -1076,7 +1065,7 @@ i_t second_stage_harris(const lp_problem_t& lp, const std::vector& vstatus, const std::vector& nonbasic_list, const std::vector& z, - const ins_vector& delta_z, + const std::vector& delta_z, f_t max_step_length, f_t& step_length, i_t& nonbasic_entering) @@ -1119,7 +1108,7 @@ i_t phase2_ratio_test(const lp_problem_t& lp, std::vector& vstatus, std::vector& nonbasic_list, std::vector& z, - ins_vector& delta_z, + std::vector& delta_z, f_t& step_length, i_t& nonbasic_entering) { @@ -1169,17 +1158,18 @@ i_t phase2_ratio_test(const lp_problem_t& lp, template i_t flip_bounds(const lp_problem_t& lp, const simplex_solver_settings_t& settings, - const ins_vector& bounded_variables, - const ins_vector& objective, + const std::vector& bounded_variables, + const std::vector& objective, const std::vector& z, - const ins_vector& delta_z_indices, + const std::vector& delta_z_indices, const std::vector& nonbasic_list, i_t entering_index, std::vector& vstatus, - ins_vector& delta_x, - ins_vector& mark, - ins_vector& atilde, - ins_vector& atilde_index) + std::vector& delta_x, + std::vector& mark, + std::vector& atilde, + std::vector& atilde_index, + f_t& work_estimate) { i_t num_flipped = 0; for (i_t k = 0; k < delta_z_indices.size(); ++k) { @@ -1191,8 +1181,11 @@ i_t flip_bounds(const lp_problem_t& lp, const f_t dual_tol = settings.dual_tol; // lower to 1e-7 or less will cause 25fv47 and d2q06c to cycle if (vstatus[j] == variable_status_t::NONBASIC_LOWER && z[j] < -dual_tol) { - const f_t delta = lp.upper[j] - lp.lower[j]; + const f_t delta = lp.upper[j] - lp.lower[j]; + const size_t atilde_start_size = atilde_index.size(); scatter_dense(lp.A, j, -delta, atilde, mark, atilde_index); + work_estimate += 2 * (atilde_index.size() - atilde_start_size) + + 4 * (lp.A.col_start[j + 1] - lp.A.col_start[j]) + 10; delta_x[j] += delta; vstatus[j] = variable_status_t::NONBASIC_UPPER; #ifdef BOUND_FLIP_DEBUG @@ -1201,8 +1194,11 @@ i_t flip_bounds(const lp_problem_t& lp, #endif num_flipped++; } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER && z[j] > dual_tol) { - const f_t delta = lp.lower[j] - lp.upper[j]; + const f_t delta = lp.lower[j] - lp.upper[j]; + const size_t atilde_start_size = atilde_index.size(); scatter_dense(lp.A, j, -delta, atilde, mark, atilde_index); + work_estimate += 2 * (atilde_index.size() - atilde_start_size) + + 4 * (lp.A.col_start[j + 1] - lp.A.col_start[j]) + 10; delta_x[j] += delta; vstatus[j] = variable_status_t::NONBASIC_LOWER; #ifdef BOUND_FLIP_DEBUG @@ -1239,7 +1235,8 @@ i_t initialize_steepest_edge_norms(const lp_problem_t& lp, const f_t start_time, const std::vector& basic_list, basis_update_mpf_t& ft, - std::vector& delta_y_steepest_edge) + std::vector& delta_y_steepest_edge, + f_t& work_estimate) { const i_t m = basic_list.size(); @@ -1251,6 +1248,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t& lp, std::vector row_degree(m, 0); std::vector mapping(m, -1); std::vector coeff(m, 0.0); + work_estimate += 3 * m; for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; @@ -1263,7 +1261,9 @@ i_t initialize_steepest_edge_norms(const lp_problem_t& lp, mapping[k] = i; coeff[k] = lp.A.x[p]; } + work_estimate += 5 * (col_end - col_start); } + work_estimate += 3 * m; #ifdef CHECK_SINGLETON_ROWS csc_matrix_t B(m, m, 0); @@ -1285,6 +1285,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t& lp, #endif } } + work_estimate += m; if (num_singleton_rows > 0) { settings.log.printf("Found %d singleton rows for steepest edge norms in %.2fs\n", @@ -1347,6 +1348,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t& lp, for (i_t p = 0; p < sparse_dy.x.size(); ++p) { my_init += sparse_dy.x[p] * sparse_dy.x[p]; } + work_estimate += 2 * sparse_dy.x.size(); #endif #if COMPARE_WITH_DENSE if (std::abs(init - my_init) > 1e-12) { @@ -1375,6 +1377,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t& lp, return CONCURRENT_HALT_RETURN; } } + work_estimate += 7 * m; return 0; } @@ -1388,17 +1391,22 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t& settin const sparse_vector_t& scaled_delta_xB, i_t basic_leaving_index, i_t entering_index, - ins_vector& v, + std::vector& v, sparse_vector_t& v_sparse, - std::vector& delta_y_steepest_edge) + std::vector& delta_y_steepest_edge, + f_t& work_estimate) { const i_t delta_y_nz = delta_y_sparse.i.size(); v_sparse.clear(); // B^T delta_y = - direction * e_basic_leaving_index // We want B v = - B^{-T} e_basic_leaving_index ft.b_solve(delta_y_sparse, v_sparse); - if (direction == -1) { v_sparse.negate(); } + if (direction == -1) { + v_sparse.negate(); + work_estimate += 2 * v_sparse.i.size(); + } v_sparse.scatter(v); + work_estimate += 2 * v_sparse.i.size(); const i_t leaving_index = basic_list[basic_leaving_index]; const f_t prev_dy_norm_squared = delta_y_steepest_edge[leaving_index]; @@ -1417,23 +1425,21 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t& settin // B*w = A(:, leaving_index) // B*scaled_delta_xB = -A(:, leaving_index) so w = -scaled_delta_xB const f_t wr = -scaled_delta_xB.find_coefficient(basic_leaving_index); + work_estimate += scaled_delta_xB.i.size(); if (wr == 0) { return -1; } const f_t omegar = dy_norm_squared / (wr * wr); const i_t scaled_delta_xB_nz = scaled_delta_xB.i.size(); - auto& dxB_i = scaled_delta_xB.i.underlying(); - auto& dxB_x = scaled_delta_xB.x.underlying(); - auto& v_raw = v.underlying(); - for (i_t h = 0; h < scaled_delta_xB_nz; ++h) { - const i_t k = dxB_i[h]; + const i_t k = scaled_delta_xB.i[h]; const i_t j = basic_list[k]; if (k == basic_leaving_index) { - const f_t w_squared = dxB_x[h] * dxB_x[h]; + const f_t w = scaled_delta_xB.x[h]; + const f_t w_squared = w * w; delta_y_steepest_edge[j] = (1.0 / w_squared) * dy_norm_squared; } else { - const f_t wk = -dxB_x[h]; - f_t new_val = delta_y_steepest_edge[j] + wk * (2.0 * v_raw[k] / wr + wk * omegar); + const f_t wk = -scaled_delta_xB.x[h]; + f_t new_val = delta_y_steepest_edge[j] + wk * (2.0 * v[k] / wr + wk * omegar); new_val = std::max(new_val, 1e-4); #ifdef STEEPEST_EDGE_DEBUG if (!(new_val >= 0)) { @@ -1452,18 +1458,13 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t& settin delta_y_steepest_edge[j] = new_val; } } + work_estimate += 5 * scaled_delta_xB_nz; - auto& vs_i = v_sparse.i.underlying(); - const i_t v_nz = vs_i.size(); + const i_t v_nz = v_sparse.i.size(); for (i_t k = 0; k < v_nz; ++k) { - v_raw[vs_i[k]] = 0.0; + v[v_sparse.i[k]] = 0.0; } - - scaled_delta_xB.i.byte_loads += scaled_delta_xB_nz * sizeof(i_t); - scaled_delta_xB.x.byte_loads += scaled_delta_xB_nz * sizeof(f_t); - v.byte_loads += scaled_delta_xB_nz * sizeof(f_t); - v.byte_stores += v_nz * sizeof(f_t); - v_sparse.i.byte_loads += v_nz * sizeof(i_t); + work_estimate += 2 * v_nz; return 0; } @@ -1520,10 +1521,11 @@ i_t check_steepest_edge_norms(const simplex_solver_settings_t& setting template i_t compute_perturbation(const lp_problem_t& lp, const simplex_solver_settings_t& settings, - const ins_vector& delta_z_indices, + const std::vector& delta_z_indices, std::vector& z, - ins_vector& objective, - f_t& sum_perturb) + std::vector& objective, + f_t& sum_perturb, + f_t& work_estimate) { const i_t n = lp.num_cols; const i_t m = lp.num_rows; @@ -1558,6 +1560,7 @@ i_t compute_perturbation(const lp_problem_t& lp, #endif } } + work_estimate += 7 * delta_z_indices.size(); #ifdef PERTURBATION_DEBUG if (num_perturb > 0) { settings.log.printf("Perturbed %d dual variables by %e\n", num_perturb, sum_perturb); @@ -1566,11 +1569,12 @@ i_t compute_perturbation(const lp_problem_t& lp, return 0; } -template +template void reset_basis_mark(const std::vector& basic_list, const std::vector& nonbasic_list, - ins_vector& basic_mark, - ins_vector& nonbasic_mark) + std::vector& basic_mark, + std::vector& nonbasic_mark, + f_t& work_estimate) { const i_t m = basic_list.size(); const i_t n = nonbasic_mark.size(); @@ -1579,18 +1583,22 @@ void reset_basis_mark(const std::vector& basic_list, for (i_t k = 0; k < n; k++) { basic_mark[k] = -1; } + work_estimate += n; for (i_t k = 0; k < n; k++) { nonbasic_mark[k] = -1; } + work_estimate += n; for (i_t k = 0; k < n_minus_m; k++) { nonbasic_mark[nonbasic_list[k]] = k; } + work_estimate += 2 * n_minus_m; for (i_t k = 0; k < m; k++) { basic_mark[basic_list[k]] = k; } + work_estimate += 2 * m; } template @@ -1617,20 +1625,14 @@ void compute_delta_y(const basis_update_mpf_t& ft, #ifdef CHECK_B_TRANSPOSE_SOLVE std::vector delta_y_sparse_vector_check(m); delta_y_sparse.to_dense(delta_y_sparse_vector_check); - f_t error_check = 0.0; - for (i_t k = 0; k < m; ++k) { - if (std::abs(delta_y[k] - delta_y_sparse_vector_check[k]) > 1e-6) { - settings.log.printf( - "\tBTranspose error %d %e %e\n", k, delta_y[k], delta_y_sparse_vector_check[k]); - } - error_check += std::abs(delta_y[k] - delta_y_sparse_vector_check[k]); - } - if (error_check > 1e-6) { settings.log.printf("BTranspose error %e\n", error_check); } + // Pass in basic_list and lp for this code to work std::vector residual(m); + std::vector ei(m, 0); + ei[basic_leaving_index] = -direction; b_transpose_multiply(lp, basic_list, delta_y_sparse_vector_check, residual); for (i_t k = 0; k < m; ++k) { if (std::abs(residual[k] - ei[k]) > 1e-6) { - settings.log.printf("\tBTranspose multiply error %d %e %e\n", k, residual[k], ei[k]); + printf("\tBTranspose multiply error %d %e %e\n", k, residual[k], ei[k]); } } #endif @@ -1638,12 +1640,13 @@ void compute_delta_y(const basis_update_mpf_t& ft, template i_t update_dual_variables(const sparse_vector_t& delta_y_sparse, - const ins_vector& delta_z_indices, - const ins_vector& delta_z, + const std::vector& delta_z_indices, + const std::vector& delta_z, f_t step_length, i_t leaving_index, std::vector& y, - std::vector& z) + std::vector& z, + f_t& work_estimate) { // Update dual variables // y <- y + steplength * delta_y @@ -1652,12 +1655,14 @@ i_t update_dual_variables(const sparse_vector_t& delta_y_sparse, const i_t i = delta_y_sparse.i[k]; y[i] += step_length * delta_y_sparse.x[k]; } + work_estimate += 3 * delta_y_nz; // z <- z + steplength * delta_z const i_t delta_z_nz = delta_z_indices.size(); for (i_t k = 0; k < delta_z_nz; ++k) { const i_t j = delta_z_indices[k]; z[j] += step_length * delta_z[j]; } + work_estimate += 3 * delta_z_nz; z[leaving_index] += step_length * delta_z[leaving_index]; return 0; } @@ -1665,14 +1670,15 @@ i_t update_dual_variables(const sparse_vector_t& delta_y_sparse, template void adjust_for_flips(const basis_update_mpf_t& ft, const std::vector& basic_list, - const ins_vector& delta_z_indices, - ins_vector& atilde_index, - ins_vector& atilde, - ins_vector& atilde_mark, + const std::vector& delta_z_indices, + std::vector& atilde_index, + std::vector& atilde, + std::vector& atilde_mark, sparse_vector_t& atilde_sparse, sparse_vector_t& delta_xB_0_sparse, - ins_vector& delta_x_flip, - std::vector& x) + std::vector& delta_x_flip, + std::vector& x, + f_t& work_estimate) { const i_t atilde_nz = atilde_index.size(); // B*delta_xB_0 = atilde @@ -1683,27 +1689,30 @@ void adjust_for_flips(const basis_update_mpf_t& ft, atilde_sparse.i.push_back(atilde_index[k]); atilde_sparse.x.push_back(atilde[atilde_index[k]]); } + work_estimate += 5 * atilde_nz; ft.b_solve(atilde_sparse, delta_xB_0_sparse); const i_t delta_xB_0_nz = delta_xB_0_sparse.i.size(); for (i_t k = 0; k < delta_xB_0_nz; ++k) { const i_t j = basic_list[delta_xB_0_sparse.i[k]]; x[j] += delta_xB_0_sparse.x[k]; } - + work_estimate += 4 * delta_xB_0_nz; for (i_t k = 0; k < delta_z_indices.size(); ++k) { const i_t j = delta_z_indices[k]; x[j] += delta_x_flip[j]; delta_x_flip[j] = 0.0; } - + work_estimate += 4 * delta_z_indices.size(); // Clear atilde for (i_t k = 0; k < atilde_index.size(); ++k) { atilde[atilde_index[k]] = 0.0; } + work_estimate += 2 * atilde_index.size(); // Clear atilde_mark for (i_t k = 0; k < atilde_mark.size(); ++k) { atilde_mark[k] = 0; } + work_estimate += atilde_mark.size(); atilde_index.clear(); } @@ -1715,21 +1724,27 @@ i_t compute_delta_x(const lp_problem_t& lp, i_t basic_leaving_index, i_t direction, const std::vector& basic_list, - const ins_vector& delta_x_flip, + const std::vector& delta_x_flip, const sparse_vector_t& rhs_sparse, const std::vector& x, sparse_vector_t& utilde_sparse, sparse_vector_t& scaled_delta_xB_sparse, - ins_vector& delta_x) + std::vector& delta_x, + f_t& work_estimate) { f_t delta_x_leaving = direction == 1 ? lp.lower[leaving_index] - x[leaving_index] : lp.upper[leaving_index] - x[leaving_index]; // B*w = -A(:, entering) ft.b_solve(rhs_sparse, scaled_delta_xB_sparse, utilde_sparse); scaled_delta_xB_sparse.negate(); + work_estimate += 2 * scaled_delta_xB_sparse.i.size(); #ifdef CHECK_B_SOLVE + const i_t m = basic_list.size(); std::vector scaled_delta_xB(m); + scaled_delta_xB_sparse.to_dense(scaled_delta_xB); + std::vector rhs(m); + rhs_sparse.to_dense(rhs); { std::vector residual_B(m); b_multiply(lp, basic_list, scaled_delta_xB, residual_B); @@ -1737,28 +1752,35 @@ i_t compute_delta_x(const lp_problem_t& lp, for (i_t k = 0; k < m; ++k) { const f_t err = std::abs(rhs[k] + residual_B[k]); if (err >= 1e-6) { - settings.log.printf( - "Bsolve diff %d %e rhs %e residual %e\n", k, err, rhs[k], residual_B[k]); + printf("Bsolve diff %d %e rhs %e residual %e\n", k, err, rhs[k], residual_B[k]); } err_max = std::max(err_max, err); } - if (err_max > 1e-6) { settings.log.printf("B multiply error %e\n", err_max); } + if (err_max > 1e-6) { + printf("B multiply error %e\n", err_max); + } else { + printf("B multiply error %e\n", err_max); + } } #endif f_t scale = scaled_delta_xB_sparse.find_coefficient(basic_leaving_index); + work_estimate += 2 * scaled_delta_xB_sparse.i.size(); if (scale != scale) { // We couldn't find a coefficient for the basic leaving index. // The coefficient might be very small. Switch to a regular solve and try to recover. std::vector rhs; rhs_sparse.to_dense(rhs); + work_estimate += 2 * rhs.size(); const i_t m = basic_list.size(); std::vector scaled_delta_xB(m); + work_estimate += m; ft.b_solve(rhs, scaled_delta_xB); if (scaled_delta_xB[basic_leaving_index] != 0.0 && !std::isnan(scaled_delta_xB[basic_leaving_index])) { scaled_delta_xB_sparse.from_dense(scaled_delta_xB); scaled_delta_xB_sparse.negate(); + work_estimate += 2 * scaled_delta_xB_sparse.i.size() + scaled_delta_xB.size(); scale = -scaled_delta_xB[basic_leaving_index]; } else { return -1; @@ -1770,6 +1792,7 @@ i_t compute_delta_x(const lp_problem_t& lp, const i_t j = basic_list[scaled_delta_xB_sparse.i[k]]; delta_x[j] = primal_step_length * scaled_delta_xB_sparse.x[k]; } + work_estimate += 4 * scaled_delta_xB_nz; delta_x[leaving_index] = delta_x_leaving; delta_x[entering_index] = primal_step_length; return 0; @@ -1778,9 +1801,10 @@ i_t compute_delta_x(const lp_problem_t& lp, template void update_primal_variables(const sparse_vector_t& scaled_delta_xB_sparse, const std::vector& basic_list, - const ins_vector& delta_x, + const std::vector& delta_x, i_t entering_index, - std::vector& x) + std::vector& x, + f_t& work_estimate) { // x <- x + delta_x const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size(); @@ -1788,23 +1812,26 @@ void update_primal_variables(const sparse_vector_t& scaled_delta_xB_sp const i_t j = basic_list[scaled_delta_xB_sparse.i[k]]; x[j] += delta_x[j]; } + work_estimate += 4 * scaled_delta_xB_nz; // Leaving index already included above x[entering_index] += delta_x[entering_index]; } template void update_objective(const std::vector& basic_list, - const ins_vector& changed_basic_indices, + const std::vector& changed_basic_indices, const std::vector& objective, - const ins_vector& delta_x, + const std::vector& delta_x, i_t entering_index, - f_t& obj) + f_t& obj, + f_t& work_estimate) { const i_t changed_basic_nz = changed_basic_indices.size(); for (i_t k = 0; k < changed_basic_nz; ++k) { const i_t j = basic_list[changed_basic_indices[k]]; obj += delta_x[j] * objective[j]; } + work_estimate += 4 * changed_basic_nz; // Leaving index already included above obj += delta_x[entering_index] * objective[entering_index]; } @@ -2219,7 +2246,7 @@ void set_primal_variables_on_bounds(const lp_problem_t& lp, } template -f_t compute_perturbed_objective(const ins_vector& objective, const std::vector& x) +f_t compute_perturbed_objective(const std::vector& objective, const std::vector& x) { const size_t n = objective.size(); f_t obj_val = 0.0; @@ -2230,7 +2257,7 @@ f_t compute_perturbed_objective(const ins_vector& objective, const std::vec } template -f_t amount_of_perturbation(const lp_problem_t& lp, const ins_vector& objective) +f_t amount_of_perturbation(const lp_problem_t& lp, const std::vector& objective) { f_t perturbation = 0.0; const i_t n = lp.num_cols; @@ -2246,7 +2273,7 @@ void prepare_optimality(i_t info, const lp_problem_t& lp, const simplex_solver_settings_t& settings, basis_update_mpf_t& ft, - const ins_vector& objective, + const std::vector& objective, const std::vector& basic_list, const std::vector& nonbasic_list, const std::vector& vstatus, @@ -2259,8 +2286,9 @@ void prepare_optimality(i_t info, std::vector& z, lp_solution_t& sol) { - const i_t m = lp.num_rows; - const i_t n = lp.num_cols; + const i_t m = lp.num_rows; + const i_t n = lp.num_cols; + f_t work_estimate = 0; // Work in this function is not captured sol.objective = compute_objective(lp, sol.x); sol.user_objective = compute_user_objective(lp, sol.objective); @@ -2271,7 +2299,7 @@ void prepare_optimality(i_t info, std::vector unperturbed_y(m); std::vector unperturbed_z(n); phase2::compute_dual_solution_from_basis( - lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z); + lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z, work_estimate); { const f_t dual_infeas = phase2::dual_infeasibility( lp, settings, vstatus, unperturbed_z, settings.tight_tol, settings.dual_tol); @@ -2468,6 +2496,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, assert(lp.lower.size() == n); assert(lp.upper.size() == n); assert(lp.rhs.size() == m); + f_t phase2_work_estimate = 0.0; + ft.clear_work_estimate(); std::vector& x = sol.x; std::vector& y = sol.y; @@ -2475,16 +2505,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, // Declare instrumented vectors used during initialization (before aggregator setup) // Perturbed objective - ins_vector objective(lp.objective); - ins_vector c_basic(m); - ins_vector xB_workspace(m); - - // Create instrumentation aggregator early to capture init section memory operations - instrumentation_aggregator_t aggregator; + std::vector objective(lp.objective); + std::vector c_basic(m); + std::vector xB_workspace(m); - aggregator.add("objective", objective); - aggregator.add("c_basic", c_basic); - aggregator.add("xB_workspace", xB_workspace); + phase2_work_estimate += 2 * (n + m); dual::status_t status = dual::status_t::UNSET; @@ -2493,14 +2518,20 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, settings.log.printf("Dual Simplex Phase %d\n", phase); std::vector vstatus_old = vstatus; std::vector z_old = z; + phase2_work_estimate += 4 * n; phase2::bound_info(lp, settings); + phase2_work_estimate += 2 * n; + if (initialize_basis) { PHASE2_NVTX_RANGE("DualSimplex::init_basis"); std::vector superbasic_list; nonbasic_list.clear(); nonbasic_list.reserve(n - m); + phase2_work_estimate += (n - m); + get_basis_from_vstatus(m, vstatus, basic_list, nonbasic_list, superbasic_list); + phase2_work_estimate += 2 * n; assert(superbasic_list.size() == 0); assert(nonbasic_list.size() == n - m); @@ -2517,6 +2548,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, const i_t j = basic_list[k]; c_basic[k] = objective[j]; } + phase2_work_estimate += 2 * m; // Solve B'*y = cB ft.b_transpose_solve(c_basic, y); @@ -2527,7 +2559,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, "|| y || %e || cB || %e\n", vector_norm_inf(y), vector_norm_inf(c_basic)); } - phase2::compute_reduced_costs(objective.underlying(), lp.A, y, basic_list, nonbasic_list, z); + phase2::compute_reduced_costs( + objective, lp.A, y, basic_list, nonbasic_list, z, phase2_work_estimate); if constexpr (print_norms) { settings.log.printf("|| z || %e\n", vector_norm_inf(z)); } #ifdef COMPUTE_DUAL_RESIDUAL @@ -2541,6 +2574,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, #endif phase2::set_primal_variables_on_bounds(lp, settings, z, vstatus, x); + phase2_work_estimate += 5 * (n - m); #ifdef PRINT_VSTATUS_CHANGES i_t num_vstatus_changes; @@ -2552,6 +2586,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, const f_t init_dual_inf = phase2::dual_infeasibility(lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol); + phase2_work_estimate += 3 * n; if (init_dual_inf > settings.dual_tol) { settings.log.printf("Initial dual infeasibility %e\n", init_dual_inf); } @@ -2561,9 +2596,17 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, settings.log.printf("Free variable %d vstatus %d\n", j, vstatus[j]); } } + phase2_work_estimate += 3 * n; - phase2::compute_primal_variables( - ft, lp.rhs, lp.A, basic_list, nonbasic_list, settings.tight_tol, x, xB_workspace); + phase2::compute_primal_variables(ft, + lp.rhs, + lp.A, + basic_list, + nonbasic_list, + settings.tight_tol, + x, + xB_workspace, + phase2_work_estimate); if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } if (print_norms) { settings.log.printf("|| x || %e\n", vector_norm2(x)); } @@ -2580,14 +2623,17 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, if (delta_y_steepest_edge.size() == 0) { PHASE2_NVTX_RANGE("DualSimplex::initialize_steepest_edge_norms"); delta_y_steepest_edge.resize(n); + phase2_work_estimate += n; if (slack_basis) { phase2::initialize_steepest_edge_norms_from_slack_basis( basic_list, nonbasic_list, delta_y_steepest_edge); + phase2_work_estimate += 2 * n; } else { std::fill(delta_y_steepest_edge.begin(), delta_y_steepest_edge.end(), -1); + phase2_work_estimate += n; f_t steepest_edge_start = tic(); i_t status = phase2::initialize_steepest_edge_norms( - lp, settings, start_time, basic_list, ft, delta_y_steepest_edge); + lp, settings, start_time, basic_list, ft, delta_y_steepest_edge, phase2_work_estimate); f_t steepest_edge_time = toc(steepest_edge_start); if (status == CONCURRENT_HALT_RETURN) { return dual::status_t::CONCURRENT_LIMIT; } if (status == -1) { return dual::status_t::TIME_LIMIT; } @@ -2598,6 +2644,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, const i_t j = basic_list[k]; if (delta_y_steepest_edge[j] <= 0.0) { delta_y_steepest_edge[j] = 1e-4; } } + phase2_work_estimate += 2 * m; settings.log.printf("using exisiting steepest edge %e\n", vector_norm2(delta_y_steepest_edge)); } @@ -2608,28 +2655,31 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, const i_t iter_limit = settings.iteration_limit; - // Instrumented vectors for memory access tracking - ins_vector delta_y(m, 0.0); - ins_vector delta_z(n, 0.0); - ins_vector delta_x(n, 0.0); - ins_vector delta_x_flip(n, 0.0); - ins_vector atilde(m, 0.0); - ins_vector atilde_mark(m, 0); - ins_vector atilde_index; - ins_vector nonbasic_mark(n); - ins_vector basic_mark(n); - ins_vector delta_z_mark(n, 0); - ins_vector delta_z_indices; - ins_vector v(m, 0.0); - ins_vector squared_infeasibilities; - ins_vector infeasibility_indices; + std::vector delta_y(m, 0.0); + std::vector delta_z(n, 0.0); + std::vector delta_x(n, 0.0); + std::vector delta_x_flip(n, 0.0); + std::vector atilde(m, 0.0); + std::vector atilde_mark(m, 0); + std::vector atilde_index; + std::vector nonbasic_mark(n); + std::vector basic_mark(n); + std::vector delta_z_mark(n, 0); + std::vector delta_z_indices; + std::vector v(m, 0.0); + std::vector squared_infeasibilities; + std::vector infeasibility_indices; + phase2_work_estimate += 6 * n + 4 * m; delta_z_indices.reserve(n); + phase2_work_estimate += n; - phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark); + phase2::reset_basis_mark( + basic_list, nonbasic_list, basic_mark, nonbasic_mark, phase2_work_estimate); - ins_vector bounded_variables(n, 0); + std::vector bounded_variables(n, 0); phase2::compute_bounded_info(lp.lower, lp.upper, bounded_variables); + phase2_work_estimate += 4 * n; f_t primal_infeasibility; f_t primal_infeasibility_squared = @@ -2640,21 +2690,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, squared_infeasibilities, infeasibility_indices, primal_infeasibility); + phase2_work_estimate += 4 * m + 2 * n; #ifdef CHECK_BASIC_INFEASIBILITIES phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0); #endif csc_matrix_t A_transpose(1, 1, 0); - aggregator.add("A_transpose.col_start", A_transpose.col_start); - aggregator.add("A_transpose.i", A_transpose.i); - aggregator.add("A_transpose.x", A_transpose.x); - { - PHASE2_NVTX_RANGE("DualSimplex::transpose_A"); - lp.A.transpose(A_transpose); - } + lp.A.transpose(A_transpose); + phase2_work_estimate += 2 * lp.A.col_start[lp.A.n]; + f_t obj = compute_objective(lp, x); - init_scope.pop(); // End phase2_advanced_init range + phase2_work_estimate += 2 * n; const i_t start_iter = iter; @@ -2665,11 +2712,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, f_t delta_y_nz_percentage = 0.0; phase2::phase2_timers_t timers(false); - // // Feature collection for regression training - // dual_simplex_features_t features; - // features.init_from_problem(lp, settings, phase, slack_basis != 0, initialize_basis); - // features.start_iteration = iter; - // Sparse vectors for main loop (declared outside loop for instrumentation) sparse_vector_t delta_y_sparse(m, 0); sparse_vector_t UTsol_sparse(m, 0); @@ -2680,82 +2722,20 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, sparse_vector_t v_sparse(m, 0); // For steepest edge norms sparse_vector_t atilde_sparse(m, 0); // For flip adjustments - // Add remaining instrumented vectors to aggregator (x, y, z, objective, c_basic, xB_workspace - // added earlier) Delta vectors - aggregator.add("delta_y", delta_y); - aggregator.add("delta_z", delta_z); - aggregator.add("delta_x", delta_x); - aggregator.add("delta_x_flip", delta_x_flip); - aggregator.add("atilde", atilde); - aggregator.add("atilde_mark", atilde_mark); - aggregator.add("atilde_index", atilde_index); - aggregator.add("nonbasic_mark", nonbasic_mark); - aggregator.add("basic_mark", basic_mark); - aggregator.add("delta_z_mark", delta_z_mark); - aggregator.add("delta_z_indices", delta_z_indices); - aggregator.add("v", v); - aggregator.add("squared_infeasibilities", squared_infeasibilities); - aggregator.add("infeasibility_indices", infeasibility_indices); - aggregator.add("bounded_variables", bounded_variables); - - // Add sparse vector internal arrays to aggregator - aggregator.add("delta_y_sparse.i", delta_y_sparse.i); - aggregator.add("delta_y_sparse.x", delta_y_sparse.x); - aggregator.add("UTsol_sparse.i", UTsol_sparse.i); - aggregator.add("UTsol_sparse.x", UTsol_sparse.x); - aggregator.add("delta_xB_0_sparse.i", delta_xB_0_sparse.i); - aggregator.add("delta_xB_0_sparse.x", delta_xB_0_sparse.x); - aggregator.add("utilde_sparse.i", utilde_sparse.i); - aggregator.add("utilde_sparse.x", utilde_sparse.x); - aggregator.add("scaled_delta_xB_sparse.i", scaled_delta_xB_sparse.i); - aggregator.add("scaled_delta_xB_sparse.x", scaled_delta_xB_sparse.x); - aggregator.add("rhs_sparse.i", rhs_sparse.i); - aggregator.add("rhs_sparse.x", rhs_sparse.x); - aggregator.add("v_sparse.i", v_sparse.i); - aggregator.add("v_sparse.x", v_sparse.x); - aggregator.add("atilde_sparse.i", atilde_sparse.i); - aggregator.add("atilde_sparse.x", atilde_sparse.x); - - // Add A matrix for entering column access during basis update - aggregator.add("A.col_start", lp.A.col_start); - aggregator.add("A.i", lp.A.i); - aggregator.add("A.x", lp.A.x); - // Track iteration interval start time for runtime measurement [[maybe_unused]] f_t interval_start_time = toc(start_time); i_t last_feature_log_iter = iter; - cuopt::scope_guard work_unit_guard([&]() { - i_t remaining_iters = iter - last_feature_log_iter; - if (remaining_iters <= 0) return; - - auto [total_loads, total_stores] = aggregator.collect_and_flush(); - // features.byte_loads = total_loads; - // features.byte_stores = total_stores; - - // f_t now = toc(start_time); - // features.interval_runtime = now - interval_start_time; - // interval_start_time = now; - - // features.iteration = iter; - // features.num_refactors = num_refactors; - // features.num_basis_updates = ft.num_updates(); - // features.sparse_delta_z_count = sparse_delta_z; - // features.dense_delta_z_count = dense_delta_z; - // features.total_bound_flips = total_bound_flips; - // features.num_infeasibilities = infeasibility_indices.size(); - // features.delta_y_nz_percentage = delta_y_nz_percentage; - // features.log_features(settings); - - if (work_unit_context) { - // TEMP; - work_unit_context->record_work_sync_on_horizon((total_loads + total_stores) / 1e8); - } - }); + phase2_work_estimate += ft.work_estimate(); + ft.clear_work_estimate(); + if (work_unit_context) { + work_unit_context->record_work_sync_on_horizon((phase2_work_estimate) / 1e8); + } + phase2_work_estimate = 0.0; if (phase == 2) { settings.log.printf("%5d %+.16e %7d %.8e %.2e %.2f\n", - 0, + iter, compute_user_objective(lp, obj), infeasibility_indices.size(), primal_infeasibility_squared, @@ -2783,7 +2763,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, infeasibility_indices, direction, basic_leaving_index, - max_val); + max_val, + phase2_work_estimate); } else { // Max infeasibility pricing leaving_index = phase2::phase2_pricing( @@ -2883,6 +2864,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, timers.btran_time += timers.stop_timer(); const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared(); + phase2_work_estimate += 2 * delta_y_sparse.i.size(); if (delta_y_steepest_edge[leaving_index] < settings.steepest_edge_ratio * steepest_edge_norm_check) { constexpr bool verbose = false; @@ -2904,6 +2886,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, for (i_t k = 0; k < nz_delta_y; k++) { if (std::abs(delta_y_sparse.x[k]) > 1e-12) { delta_y_nz0++; } } + phase2_work_estimate += nz_delta_y; delta_y_nz_percentage = delta_y_nz0 / static_cast(m) * 100.0; const bool use_transpose = delta_y_nz_percentage <= 30.0; { @@ -2917,11 +2900,13 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, nonbasic_mark, delta_z_mark, delta_z_indices, - delta_z); + delta_z, + phase2_work_estimate); } else { dense_delta_z++; // delta_zB = sigma*ei delta_y_sparse.to_dense(delta_y); + phase2_work_estimate += delta_y.size(); phase2::compute_reduced_cost_update(lp, basic_list, nonbasic_list, @@ -2930,7 +2915,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, direction, delta_z_mark, delta_z_indices, - delta_z); + delta_z, + phase2_work_estimate); } } timers.delta_z_time += timers.stop_timer(); @@ -2974,14 +2960,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, slope, lp.lower, lp.upper, - bounded_variables.underlying(), + bounded_variables, vstatus, nonbasic_list, z, - delta_z.underlying(), - delta_z_indices.underlying(), + delta_z, + delta_z_indices, nonbasic_mark); entering_index = bfrt.compute_step_length(step_length, nonbasic_entering_index); + phase2_work_estimate += bfrt.work_estimate(); if (entering_index == RATIO_TEST_NUMERICAL_ISSUES) { settings.log.printf("Numerical issues encountered in ratio test.\n"); return dual::status_t::NUMERICAL; @@ -2998,26 +2985,37 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, settings.log.printf("No entering variable found. Iter %d\n", iter); settings.log.printf("Scaled infeasibility %e\n", max_val); f_t perturbation = phase2::amount_of_perturbation(lp, objective); + phase2_work_estimate += 2 * n; if (perturbation > 0.0 && phase == 2) { // Try to remove perturbation std::vector unperturbed_y(m); std::vector unperturbed_z(n); + phase2_work_estimate += m + n; phase2::compute_dual_solution_from_basis( - lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z); + lp, ft, basic_list, nonbasic_list, unperturbed_y, unperturbed_z, phase2_work_estimate); { const f_t dual_infeas = phase2::dual_infeasibility( lp, settings, vstatus, unperturbed_z, settings.tight_tol, settings.dual_tol); + phase2_work_estimate += 3 * n; settings.log.printf("Dual infeasibility after removing perturbation %e\n", dual_infeas); if (dual_infeas <= settings.dual_tol) { settings.log.printf("Removed perturbation of %.2e.\n", perturbation); - z = unperturbed_z; - y = unperturbed_y; + z = unperturbed_z; + y = unperturbed_y; + phase2_work_estimate += 2 * n + 2 * m; perturbation = 0.0; std::vector unperturbed_x(n); - phase2::compute_primal_solution_from_basis( - lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace); + phase2_work_estimate += n; + phase2::compute_primal_solution_from_basis(lp, + ft, + basic_list, + nonbasic_list, + vstatus, + unperturbed_x, + xB_workspace, + phase2_work_estimate); x = unperturbed_x; primal_infeasibility_squared = phase2::compute_initial_primal_infeasibilities(lp, @@ -3027,11 +3025,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, squared_infeasibilities, infeasibility_indices, primal_infeasibility); + phase2_work_estimate += 4 * m + 2 * n; settings.log.printf("Updated primal infeasibility: %e\n", primal_infeasibility); objective = lp.objective; + phase2_work_estimate += 2 * n; // Need to reset the objective value, since we have recomputed x obj = phase2::compute_perturbed_objective(objective, x); + phase2_work_estimate += 2 * n; if (dual_infeas <= settings.dual_tol && primal_infeasibility <= settings.primal_tol) { phase2::prepare_optimality(1, primal_infeasibility, @@ -3056,14 +3057,23 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, settings.log.printf( "Continuing with perturbation removed and steepest edge norms reset\n"); // Clear delta_z before restarting the iteration + phase2_work_estimate += 3 * delta_z_indices.size(); phase2::clear_delta_z( entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z); continue; } else { std::vector unperturbed_x(n); - phase2::compute_primal_solution_from_basis( - lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace); + phase2_work_estimate += n; + phase2::compute_primal_solution_from_basis(lp, + ft, + basic_list, + nonbasic_list, + vstatus, + unperturbed_x, + xB_workspace, + phase2_work_estimate); x = unperturbed_x; + phase2_work_estimate += 2 * n; primal_infeasibility_squared = phase2::compute_initial_primal_infeasibilities(lp, settings, @@ -3072,9 +3082,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, squared_infeasibilities, infeasibility_indices, primal_infeasibility); + phase2_work_estimate += 4 * m + 2 * n; const f_t orig_dual_infeas = phase2::dual_infeasibility( lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol); + phase2_work_estimate += 3 * n; if (primal_infeasibility <= settings.primal_tol && orig_dual_infeas <= settings.dual_tol) { @@ -3135,8 +3147,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, const f_t dual_infeas = phase2::dual_infeasibility(lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol); + phase2_work_estimate += 3 * n; settings.log.printf("Dual infeasibility %e\n", dual_infeas); const f_t primal_inf = phase2::primal_infeasibility(lp, settings, vstatus, x); + phase2_work_estimate += 3 * n; settings.log.printf("Primal infeasibility %e\n", primal_inf); settings.log.printf("Updates %d\n", ft.num_updates()); settings.log.printf("Steepest edge %e\n", max_val); @@ -3152,8 +3166,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, // Update dual variables // y <- y + steplength * delta_y // z <- z + steplength * delta_z - i_t update_dual_variables_status = phase2::update_dual_variables( - delta_y_sparse, delta_z_indices, delta_z, step_length, leaving_index, y, z); + i_t update_dual_variables_status = phase2::update_dual_variables(delta_y_sparse, + delta_z_indices, + delta_z, + step_length, + leaving_index, + y, + z, + phase2_work_estimate); if (update_dual_variables_status == -1) { settings.log.printf("Numerical issues encountered in update_dual_variables.\n"); return dual::status_t::NUMERICAL; @@ -3182,7 +3202,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, delta_x_flip, atilde_mark, atilde, - atilde_index); + atilde_index, + phase2_work_estimate); timers.flip_time += timers.stop_timer(); total_bound_flips += num_flipped; @@ -3199,7 +3220,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, atilde_sparse, delta_xB_0_sparse, delta_x_flip, - x); + x, + phase2_work_estimate); timers.ftran_time += timers.stop_timer(); } @@ -3221,7 +3243,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, x, utilde_sparse, scaled_delta_xB_sparse, - delta_x) == -1) { + delta_x, + phase2_work_estimate) == -1) { settings.log.printf("Failed to compute delta_x. Iter %d\n", iter); return dual::status_t::NUMERICAL; } @@ -3248,7 +3271,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, entering_index, v, v_sparse, - delta_y_steepest_edge); + delta_y_steepest_edge, + phase2_work_estimate); #ifdef STEEPEST_EDGE_DEBUG if (steepest_edge_status == -1) { settings.log.printf("Num updates %d\n", ft.num_updates()); @@ -3260,7 +3284,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, timers.start_timer(); // x <- x + delta_x - phase2::update_primal_variables(scaled_delta_xB_sparse, basic_list, delta_x, entering_index, x); + phase2::update_primal_variables( + scaled_delta_xB_sparse, basic_list, delta_x, entering_index, x, phase2_work_estimate); timers.vector_time += timers.stop_timer(); #ifdef COMPUTE_PRIMAL_RESIDUAL @@ -3275,8 +3300,13 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, timers.start_timer(); // TODO(CMM): Do I also need to update the objective due to the bound flips? // TODO(CMM): I'm using the unperturbed objective here, should this be the perturbed objective? - phase2::update_objective( - basic_list, scaled_delta_xB_sparse.i, lp.objective, delta_x, entering_index, obj); + phase2::update_objective(basic_list, + scaled_delta_xB_sparse.i, + lp.objective, + delta_x, + entering_index, + obj, + phase2_work_estimate); timers.objective_time += timers.stop_timer(); timers.start_timer(); @@ -3291,10 +3321,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, x, entering_index, leaving_index, - delta_xB_0_sparse.i.underlying(), - squared_infeasibilities.underlying(), - infeasibility_indices.underlying(), - primal_infeasibility_squared); + delta_xB_0_sparse.i, + squared_infeasibilities, + infeasibility_indices, + primal_infeasibility_squared, + phase2_work_estimate); // Update primal infeasibilities due to changes in basic variables // from the leaving and entering variables phase2::update_primal_infeasibilities(lp, @@ -3303,21 +3334,23 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, x, entering_index, leaving_index, - scaled_delta_xB_sparse.i.underlying(), - squared_infeasibilities.underlying(), - infeasibility_indices.underlying(), - primal_infeasibility_squared); + scaled_delta_xB_sparse.i, + squared_infeasibilities, + infeasibility_indices, + primal_infeasibility_squared, + phase2_work_estimate); // Update the entering variable phase2::update_single_primal_infeasibility(lp.lower, lp.upper, x, settings.primal_tol, - squared_infeasibilities.underlying(), - infeasibility_indices.underlying(), + squared_infeasibilities, + infeasibility_indices, entering_index, primal_infeasibility_squared); - phase2::clean_up_infeasibilities(squared_infeasibilities, infeasibility_indices); + phase2::clean_up_infeasibilities( + squared_infeasibilities, infeasibility_indices, phase2_work_estimate); #if CHECK_PRIMAL_INFEASIBILITIES phase2::check_primal_infeasibilities( @@ -3326,11 +3359,13 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, timers.update_infeasibility_time += timers.stop_timer(); // Clear delta_x - phase2::clear_delta_x(basic_list, entering_index, scaled_delta_xB_sparse, delta_x); + phase2::clear_delta_x( + basic_list, entering_index, scaled_delta_xB_sparse, delta_x, phase2_work_estimate); timers.start_timer(); f_t sum_perturb = 0.0; - phase2::compute_perturbation(lp, settings, delta_z_indices, z, objective, sum_perturb); + phase2::compute_perturbation( + lp, settings, delta_z_indices, z, objective, sum_perturb, phase2_work_estimate); timers.perturb_time += timers.stop_timer(); // Update basis information @@ -3394,12 +3429,21 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, settings.log.printf("Successfully repaired basis. Iteration %d\n", iter); } - phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark); + phase2::reset_basis_mark( + basic_list, nonbasic_list, basic_mark, nonbasic_mark, phase2_work_estimate); if (should_recompute_x) { std::vector unperturbed_x(n); - phase2::compute_primal_solution_from_basis( - lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace); + phase2_work_estimate += n; + phase2::compute_primal_solution_from_basis(lp, + ft, + basic_list, + nonbasic_list, + vstatus, + unperturbed_x, + xB_workspace, + phase2_work_estimate); x = unperturbed_x; + phase2_work_estimate += 2 * n; } primal_infeasibility_squared = phase2::compute_initial_primal_infeasibilities(lp, @@ -3409,6 +3453,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, squared_infeasibilities, infeasibility_indices, primal_infeasibility); + phase2_work_estimate += 4 * m + 2 * n; } #ifdef CHECK_BASIC_INFEASIBILITIES phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7); @@ -3435,6 +3480,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, iter++; // Clear delta_z + phase2_work_estimate += 3 * delta_z_indices.size(); phase2::clear_delta_z(entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z); f_t now = toc(start_time); @@ -3443,27 +3489,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, if ((iter % FEATURE_LOG_INTERVAL) == 0 && work_unit_context) { [[maybe_unused]] i_t iters_elapsed = iter - last_feature_log_iter; - auto [total_loads, total_stores] = aggregator.collect_and_flush(); - // features.byte_loads = total_loads; - // features.byte_stores = total_stores; - - // features.interval_runtime = now - interval_start_time; - // interval_start_time = now; - - // features.iteration = iter; - // features.num_refactors = num_refactors; - // features.num_basis_updates = ft.num_updates(); - // features.sparse_delta_z_count = sparse_delta_z; - // features.dense_delta_z_count = dense_delta_z; - // features.total_bound_flips = total_bound_flips; - // features.num_infeasibilities = infeasibility_indices.size(); - // features.delta_y_nz_percentage = delta_y_nz_percentage; - // features.log_features(settings); - - if (work_unit_context) { - // TEMP; - work_unit_context->record_work_sync_on_horizon((total_loads + total_stores) / 1e8); - } + phase2_work_estimate += ft.work_estimate(); + ft.clear_work_estimate(); + work_unit_context->record_work_sync_on_horizon(phase2_work_estimate / 1e8); + phase2_work_estimate = 0.0; last_feature_log_iter = iter; } diff --git a/cpp/src/dual_simplex/primal.cpp b/cpp/src/dual_simplex/primal.cpp index 98f5f4193b..b1b6be0b75 100644 --- a/cpp/src/dual_simplex/primal.cpp +++ b/cpp/src/dual_simplex/primal.cpp @@ -264,6 +264,7 @@ primal::status_t primal_phase2(i_t phase, assert(lp.lower.size() == n); assert(lp.upper.size() == n); assert(lp.rhs.size() == m); + f_t work_estimate = 0; std::vector basic_list(m); std::vector nonbasic_list; std::vector superbasic_list; @@ -294,8 +295,8 @@ primal::status_t primal_phase2(i_t phase, std::vector q(m); std::vector deficient; std::vector slacks_needed; - i_t rank = - factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + i_t rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return primal::status_t::CONCURRENT_LIMIT; } else if (rank != m) { @@ -309,8 +310,10 @@ primal::status_t primal_phase2(i_t phase, basic_list, nonbasic_list, superbasic_list, - vstatus); - rank = factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed); + vstatus, + work_estimate); + rank = factorize_basis( + lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed, work_estimate); if (rank == CONCURRENT_HALT_RETURN) { return primal::status_t::CONCURRENT_LIMIT; } else if (rank == -1) { diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp index cb9834705c..93a726d065 100644 --- a/cpp/src/dual_simplex/right_looking_lu.cpp +++ b/cpp/src/dual_simplex/right_looking_lu.cpp @@ -33,17 +33,20 @@ struct element_t { }; constexpr int kNone = -1; -template +template i_t initialize_degree_data(const csc_matrix_t& A, - const VectorI& column_list, + const std::vector& column_list, std::vector& Cdegree, std::vector& Rdegree, std::vector>& col_count, - std::vector>& row_count) + std::vector>& row_count, + f_t& work_estimate) { const i_t n = column_list.size(); const i_t m = A.m; std::fill(Rdegree.begin(), Rdegree.end(), 0); + work_estimate += Rdegree.size(); + i_t Bnz = 0; for (i_t k = 0; k < n; ++k) { const i_t j = column_list[k]; @@ -55,11 +58,13 @@ i_t initialize_degree_data(const csc_matrix_t& A, Bnz++; } } + work_estimate += 3 * n + 2 * Bnz; for (i_t k = 0; k < n; ++k) { assert(Cdegree[k] <= m && Cdegree[k] >= 0); col_count[Cdegree[k]].push_back(k); } + work_estimate += 3 * n; for (i_t k = 0; k < m; ++k) { assert(Rdegree[k] <= n && Rdegree[k] >= 0); @@ -69,20 +74,24 @@ i_t initialize_degree_data(const csc_matrix_t& A, if (verbose) { printf("Zero degree row %d\n", k); } } } + work_estimate += 4 * m; + return Bnz; } -template +template i_t load_elements(const csc_matrix_t& A, - const VectorI& column_list, + const std::vector& column_list, i_t Bnz, std::vector>& elements, std::vector& first_in_row, - std::vector& first_in_col) + std::vector& first_in_col, + f_t& work_estimate) { const i_t m = A.m; const i_t n = column_list.size(); std::vector last_element_in_row(m, kNone); + work_estimate += m; i_t nz = 0; for (i_t k = 0; k < n; ++k) { @@ -110,6 +119,7 @@ i_t load_elements(const csc_matrix_t& A, nz++; } } + work_estimate += 3 * n + 15 * nz; assert(nz == Bnz); for (i_t j = 0; j < n; j++) { @@ -120,6 +130,7 @@ i_t load_elements(const csc_matrix_t& A, assert(entry->i < m); } } + work_estimate += 2 * n + nz; for (i_t i = 0; i < m; i++) { for (i_t p = first_in_row[i]; p != kNone; p = elements[p].next_in_row) { @@ -129,6 +140,7 @@ i_t load_elements(const csc_matrix_t& A, assert(entry->j >= 0); } } + work_estimate += 2 * m + nz; return 0; } @@ -200,7 +212,8 @@ i_t markowitz_search(const std::vector& Cdegree, f_t threshold_tol, i_t& pivot_i, i_t& pivot_j, - i_t& pivot_p) + i_t& pivot_p, + f_t& work_estimate) { i_t nz = 1; const i_t m = Rdegree.size(); @@ -217,6 +230,7 @@ i_t markowitz_search(const std::vector& Cdegree, assert(Cdegree[j] == nz); const f_t max_in_col = max_in_column[j]; + work_estimate += 3 * nz; for (i_t p = first_in_col[j]; p != kNone; p = elements[p].next_in_column) { element_t* entry = &elements[p]; const i_t i = entry->i; @@ -260,6 +274,7 @@ i_t markowitz_search(const std::vector& Cdegree, assert(row_count[nz].size() >= 0); for (const i_t i : row_count[nz]) { assert(Rdegree[i] == nz); + work_estimate += 5 * nz; #ifdef THRESHOLD_ROOK_PIVOTING const f_t max_in_row_i = max_in_row[i]; #endif @@ -301,9 +316,11 @@ void update_Cdegree_and_col_count(i_t pivot_i, const std::vector& first_in_row, std::vector& Cdegree, std::vector>& col_count, - std::vector>& elements) + std::vector>& elements, + f_t& work_estimate) { // Update Cdegree and col_count + i_t loop_count = 0; for (i_t p = first_in_row[pivot_i]; p != kNone; p = elements[p].next_in_row) { element_t* entry = &elements[p]; const i_t j = entry->j; @@ -317,13 +334,16 @@ void update_Cdegree_and_col_count(i_t pivot_i, // Remove col j from col_count[cdeg] std::swap(*it, col_count[cdeg].back()); col_count[cdeg].pop_back(); + work_estimate += (it - col_count[cdeg].begin()); break; } } cdeg = --Cdegree[j]; assert(cdeg >= 0); if (j != pivot_j && cdeg >= 0) { col_count[cdeg].push_back(j); } + loop_count++; } + work_estimate += 7 * loop_count; Cdegree[pivot_j] = -1; } @@ -333,9 +353,11 @@ void update_Rdegree_and_row_count(i_t pivot_i, const std::vector& first_in_col, std::vector& Rdegree, std::vector>& row_count, - std::vector>& elements) + std::vector>& elements, + f_t& work_estimate) { // Update Rdegree and row_count + i_t loop_count = 0; for (i_t p = first_in_col[pivot_j]; p != kNone; p = elements[p].next_in_column) { element_t* entry = &elements[p]; const i_t i = entry->i; @@ -348,13 +370,16 @@ void update_Rdegree_and_row_count(i_t pivot_i, // Remove row i from row_count[rdeg] std::swap(*it, row_count[rdeg].back()); row_count[rdeg].pop_back(); + work_estimate += (it - row_count[rdeg].begin()); break; } } rdeg = --Rdegree[i]; assert(rdeg >= 0); if (i != pivot_i && rdeg >= 0) { row_count[rdeg].push_back(i); } + loop_count++; } + work_estimate += 7 * loop_count; Rdegree[pivot_i] = -1; } @@ -375,7 +400,8 @@ void schur_complement(i_t pivot_i, std::vector& Cdegree, std::vector>& row_count, std::vector>& col_count, - std::vector>& elements) + std::vector>& elements, + f_t& work_estimate) { for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) { element_t* e = &elements[p1]; @@ -384,8 +410,10 @@ void schur_complement(i_t pivot_i, for (i_t p3 = first_in_row[i]; p3 != kNone; p3 = elements[p3].next_in_row) { row_last = p3; } + work_estimate += 2 * Rdegree[i]; row_last_workspace[i] = row_last; } + work_estimate += 4 * Cdegree[pivot_j]; for (i_t p0 = first_in_row[pivot_i]; p0 != kNone; p0 = elements[p0].next_in_row) { element_t* entry = &elements[p0]; @@ -402,6 +430,7 @@ void schur_complement(i_t pivot_i, column_j_workspace[i] = p1; col_last = p1; } + work_estimate += 5 * Cdegree[j]; for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) { element_t* e = &elements[p1]; @@ -457,6 +486,7 @@ void schur_complement(i_t pivot_i, // Remove row i from row_count[rdeg] std::swap(*it, row_count[rdeg].back()); row_count[rdeg].pop_back(); + work_estimate += 2 * (it - row_count[rdeg].begin()); break; } } @@ -471,6 +501,7 @@ void schur_complement(i_t pivot_i, // Remove col j from col_count[cdeg] std::swap(*it, col_count[cdeg].back()); col_count[cdeg].pop_back(); + work_estimate += 2 * (it - col_count[cdeg].begin()); break; } } @@ -478,6 +509,7 @@ void schur_complement(i_t pivot_i, col_count[cdeg].push_back(j); // Add column j to col_count[cdeg] } } + work_estimate += 10 * Cdegree[pivot_j]; for (i_t p1 = first_in_col[j]; p1 != kNone; p1 = elements[p1].next_in_column) { element_t* e = &elements[p1]; @@ -485,7 +517,9 @@ void schur_complement(i_t pivot_i, assert(e->j == j); column_j_workspace[i] = kNone; } + work_estimate += 5 * Cdegree[j]; } + work_estimate += 10 * Rdegree[pivot_i]; } template @@ -494,16 +528,19 @@ void remove_pivot_row(i_t pivot_i, std::vector& first_in_col, std::vector& first_in_row, std::vector& max_in_column, - std::vector>& elements) + std::vector>& elements, + f_t& work_estimate) { // Remove the pivot row + i_t row_loop_count = 0; for (i_t p0 = first_in_row[pivot_i]; p0 != kNone; p0 = elements[p0].next_in_row) { element_t* e = &elements[p0]; const i_t j = e->j; if (j == pivot_j) { continue; } - i_t last = kNone; - f_t max_in_col_j = 0; + i_t last = kNone; + f_t max_in_col_j = 0; + i_t col_loop_count = 0; for (i_t p = first_in_col[j]; p != kNone; p = elements[p].next_in_column) { element_t* entry = &elements[p]; if (entry->i == pivot_i) { @@ -520,9 +557,13 @@ void remove_pivot_row(i_t pivot_i, if (abs_entryx > max_in_col_j) { max_in_col_j = abs_entryx; } } last = p; + col_loop_count++; } + work_estimate += 3 * col_loop_count; max_in_column[j] = max_in_col_j; + row_loop_count++; } + work_estimate += 5 * row_loop_count; first_in_row[pivot_i] = kNone; } @@ -533,9 +574,11 @@ void remove_pivot_col(i_t pivot_i, std::vector& first_in_col, std::vector& first_in_row, std::vector& max_in_row, - std::vector>& elements) + std::vector>& elements, + f_t& work_estimate) { // Remove the pivot col + i_t col_loop_count = 0; for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) { element_t* e = &elements[p1]; const i_t i = e->i; @@ -543,6 +586,7 @@ void remove_pivot_col(i_t pivot_i, #ifdef THRESHOLD_ROOK_PIVOTING f_t max_in_row_i = 0.0; #endif + i_t row_loop_count = 0; for (i_t p = first_in_row[i]; p != kNone; p = elements[p].next_in_row) { element_t* entry = &elements[p]; if (entry->j == pivot_j) { @@ -562,25 +606,30 @@ void remove_pivot_col(i_t pivot_i, } #endif last = p; + row_loop_count++; } + work_estimate += 3 * row_loop_count; #ifdef THRESHOLD_ROOK_PIVOTING max_in_row[i] = max_in_row_i; #endif + col_loop_count++; } first_in_col[pivot_j] = kNone; + work_estimate += 3 * col_loop_count; } } // namespace -template +template i_t right_looking_lu(const csc_matrix_t& A, const simplex_solver_settings_t& settings, f_t tol, - const VectorI& column_list, - VectorI& q, + const std::vector& column_list, + std::vector& q, csc_matrix_t& L, csc_matrix_t& U, - VectorI& pinv) + std::vector& pinv, + f_t& work_estimate) { raft::common::nvtx::range scope("LU::right_looking_lu"); const i_t n = column_list.size(); @@ -596,23 +645,31 @@ i_t right_looking_lu(const csc_matrix_t& A, std::vector Rdegree(n); // Rdegree[i] is the degree of row i std::vector Cdegree(n); // Cdegree[j] is the degree of column j + work_estimate += 2 * n; std::vector> col_count( n + 1); // col_count[nz] is a list of columns with nz nonzeros in the active submatrix std::vector> row_count( n + 1); // row_count[nz] is a list of rows with nz nonzeros in the active submatrix + work_estimate += 2 * n; - const i_t Bnz = initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count); + const i_t Bnz = + initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate); std::vector> elements(Bnz); std::vector first_in_row(n, kNone); std::vector first_in_col(n, kNone); - load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col); + work_estimate += 2 * n + Bnz; + load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col, work_estimate); std::vector column_j_workspace(n, kNone); std::vector row_last_workspace(n); std::vector max_in_column(n); std::vector max_in_row(m); + work_estimate += 3 * n + m; + initialize_max_in_column(first_in_col, elements, max_in_column); + work_estimate += Bnz; + #ifdef THRESHOLD_ROOK_PIVOTING initialize_max_in_row(first_in_row, elements, max_in_row); #endif @@ -622,6 +679,7 @@ i_t right_looking_lu(const csc_matrix_t& A, Urow.n = Urow.m = n; Urow.row_start.resize(n + 1, -1); i_t Unz = 0; + work_estimate += 2 * n; i_t Lnz = 0; L.x.clear(); @@ -631,6 +689,7 @@ i_t right_looking_lu(const csc_matrix_t& A, std::fill(pinv.begin(), pinv.end(), -1); std::vector qinv(n); std::fill(qinv.begin(), qinv.end(), -1); + work_estimate += 4 * n; i_t pivots = 0; for (i_t k = 0; k < n; ++k) { @@ -657,7 +716,8 @@ i_t right_looking_lu(const csc_matrix_t& A, threshold_tol, pivot_i, pivot_j, - pivot_p); + pivot_p, + work_estimate); if (pivot_i == -1 || pivot_j == -1) { break; } element_t* pivot_entry = &elements[pivot_p]; assert(pivot_i != -1 && pivot_j != -1); @@ -688,6 +748,7 @@ i_t right_looking_lu(const csc_matrix_t& A, Unz++; } } + work_estimate += 4 * (Unz - Urow.row_start[k]); // L <- [L l] L.col_start[k] = Lnz; @@ -708,10 +769,13 @@ i_t right_looking_lu(const csc_matrix_t& A, Lnz++; } } + work_estimate += 4 * (Lnz - L.col_start[k]); // Update Cdegree and col_count - update_Cdegree_and_col_count(pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements); - update_Rdegree_and_row_count(pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements); + update_Cdegree_and_col_count( + pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements, work_estimate); + update_Rdegree_and_row_count( + pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements, work_estimate); // A22 <- A22 - l u^T schur_complement(pivot_i, @@ -730,11 +794,14 @@ i_t right_looking_lu(const csc_matrix_t& A, Cdegree, row_count, col_count, - elements); + elements, + work_estimate); // Remove the pivot row - remove_pivot_row(pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements); - remove_pivot_col(pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements); + remove_pivot_row( + pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements, work_estimate); + remove_pivot_col( + pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements, work_estimate); // Set pivot entry to sentinel value pivot_entry->i = -1; @@ -855,6 +922,7 @@ i_t right_looking_lu(const csc_matrix_t& A, for (i_t i = 0; i < m; ++i) { if (pinv[i] == -1) { pinv[i] = start++; } } + work_estimate += m; // Finalize the permutation q. Do this by first completing the inverse permutation qinv. // Then invert qinv to get the final permutation q. @@ -862,7 +930,9 @@ i_t right_looking_lu(const csc_matrix_t& A, for (i_t j = 0; j < n; ++j) { if (qinv[j] == -1) { qinv[j] = start++; } } + work_estimate += n; inverse_permutation(qinv, q); + work_estimate += 2 * n; return pivots; } @@ -875,6 +945,7 @@ i_t right_looking_lu(const csc_matrix_t& A, for (i_t p = 0; p < Lnz; ++p) { L.i[p] = pinv[L.i[p]]; } + work_estimate += 3 * Lnz; #ifdef CHECK_LOWER_TRIANGULAR for (i_t j = 0; j < n; ++j) { @@ -889,17 +960,23 @@ i_t right_looking_lu(const csc_matrix_t& A, #endif csc_matrix_t U_unpermuted(n, n, 1); + work_estimate += n; Urow.to_compressed_col( U_unpermuted); // Convert Urow to U stored in compressed sparse column format + work_estimate += n + Unz; std::vector row_perm(n); + work_estimate += n; inverse_permutation(pinv, row_perm); + work_estimate += 2 * n; std::vector identity(n); for (i_t k = 0; k < n; k++) { identity[k] = k; } + work_estimate += 2 * n; U_unpermuted.permute_rows_and_cols(identity, q, U); + work_estimate += 3 * U.n + 5 * Unz; #ifdef CHECK_UPPER_TRIANGULAR for (i_t k = 0; k < n; ++k) { @@ -929,6 +1006,7 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, // We return the inverser row permutation vector pinv and the column permutation vector q f_t factorization_start_time = tic(); + f_t work_estimate = 0; const i_t n = A.n; const i_t m = A.m; assert(pinv.size() == m); @@ -946,11 +1024,12 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, column_list[k] = k; } - const i_t Bnz = initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count); + const i_t Bnz = + initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate); std::vector> elements(Bnz); std::vector first_in_row(m, kNone); std::vector first_in_col(n, kNone); - load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col); + load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col, work_estimate); std::vector column_j_workspace(m, kNone); std::vector row_last_workspace(m); @@ -997,7 +1076,8 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, threshold_tol, pivot_i, pivot_j, - pivot_p); + pivot_p, + work_estimate); if (pivot_i == -1 || pivot_j == -1) { settings.log.debug("Breaking can't find a pivot %d\n", k); break; @@ -1016,9 +1096,9 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, // Update Cdegree and col_count update_Cdegree_and_col_count( - pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements); + pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements, work_estimate); update_Rdegree_and_row_count( - pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements); + pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements, work_estimate); // A22 <- A22 - l u^T schur_complement(pivot_i, @@ -1037,12 +1117,14 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, Cdegree, row_count, col_count, - elements); + elements, + work_estimate); // Remove the pivot row remove_pivot_row( - pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements); - remove_pivot_col(pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements); + pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements, work_estimate); + remove_pivot_col( + pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements, work_estimate); // Set pivot entry to sentinel value pivot_entry->i = -1; @@ -1149,25 +1231,15 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE -template int right_looking_lu>( - const csc_matrix_t& A, - const simplex_solver_settings_t& settings, - double tol, - const std::vector& column_list, - std::vector& q, - csc_matrix_t& L, - csc_matrix_t& U, - std::vector& pinv); - -template int right_looking_lu>( - const csc_matrix_t& A, - const simplex_solver_settings_t& settings, - double tol, - const ins_vector& column_list, - ins_vector& q, - csc_matrix_t& L, - csc_matrix_t& U, - ins_vector& pinv); +template int right_looking_lu(const csc_matrix_t& A, + const simplex_solver_settings_t& settings, + double tol, + const std::vector& column_list, + std::vector& q, + csc_matrix_t& L, + csc_matrix_t& U, + std::vector& pinv, + double& work_estimate); template int right_looking_lu_row_permutation_only( const csc_matrix_t& A, diff --git a/cpp/src/dual_simplex/right_looking_lu.hpp b/cpp/src/dual_simplex/right_looking_lu.hpp index 179fff01b2..39182eb3f4 100644 --- a/cpp/src/dual_simplex/right_looking_lu.hpp +++ b/cpp/src/dual_simplex/right_looking_lu.hpp @@ -14,15 +14,16 @@ namespace cuopt::linear_programming::dual_simplex { -template +template i_t right_looking_lu(const csc_matrix_t& A, const simplex_solver_settings_t& settings, f_t tol, - const VectorI& column_list, - VectorI& q, + const std::vector& column_list, + std::vector& q, csc_matrix_t& L, csc_matrix_t& U, - VectorI& pinv); + std::vector& pinv, + f_t& work_estimate); template i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, diff --git a/cpp/src/dual_simplex/singletons.cpp b/cpp/src/dual_simplex/singletons.cpp index 347604dcea..dbba46f6a5 100644 --- a/cpp/src/dual_simplex/singletons.cpp +++ b/cpp/src/dual_simplex/singletons.cpp @@ -7,12 +7,10 @@ #include #include -#include +#include #include -using cuopt::ins_vector; - namespace cuopt::linear_programming::dual_simplex { // Destroys the queue but prints it @@ -27,10 +25,11 @@ void print_queue(std::queue& q) printf("\n"); } -template +template i_t order_singletons(std::queue& singleton_queue, i_t& singletons_found, - row_col_graph_t& G) + row_col_graph_t& G, + f_t& work_estimate) { constexpr i_t kEmpty = -1; while (!singleton_queue.empty()) { @@ -68,6 +67,7 @@ i_t order_singletons(std::queue& singleton_queue, break; } } + work_estimate += 2 * (xend - G.Xp[xpivot]); assert(ypivot != kEmpty); assert(G.Ydeg[ypivot] >= 0); @@ -85,6 +85,7 @@ i_t order_singletons(std::queue& singleton_queue, singleton_queue.push(x); } } + work_estimate += 2 * (yend - G.Yp[ypivot]); // Mark the pivot by flipping the degrees G.Xdeg[xpivot] = FLIP(1); G.Ydeg[ypivot] = FLIP(G.Ydeg[ypivot]); @@ -102,7 +103,8 @@ template void create_row_representation(const csc_matrix_t& A, std::vector& row_start, std::vector& col_index, - std::vector& workspace) + std::vector& workspace, + f_t& work_estimate) { // row counts i_t n = A.n; @@ -114,25 +116,32 @@ void create_row_representation(const csc_matrix_t& A, for (i_t i = 0; i < m; ++i) { workspace[i] = 0; } + work_estimate += m; // Compute row degrees for (i_t p = 0; p < nz; ++p) { workspace[A.i[p]]++; } + work_estimate += 3 * nz; + // Compute rowstart and overwrite workspace cumulative_sum(workspace, row_start); + work_estimate += 4 * workspace.size(); for (i_t j = 0; j < n; ++j) { - for (i_t p = A.col_start[j]; p < A.col_start[j + 1]; ++p) { + const i_t col_start = A.col_start[j]; + const i_t col_end = A.col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { i_t q = workspace[A.i[p]]++; col_index[q] = j; } } + work_estimate += 2 * n + 4 * nz; } // Complete the permuation -template -i_t complete_permutation(i_t singletons, std::vector& Xdeg, VectorI& Xperm) +template +i_t complete_permutation(i_t singletons, std::vector& Xdeg, std::vector& Xperm) { i_t n = Xdeg.size(); assert(Xperm.size() == n); @@ -157,12 +166,13 @@ i_t complete_permutation(i_t singletons, std::vector& Xdeg, VectorI& Xperm) return num_empty; } -template +template i_t find_singletons(const csc_matrix_t& A, i_t& row_singletons, - VectorI& row_perm, + std::vector& row_perm, i_t& col_singletons, - VectorI& col_perm) + std::vector& col_perm, + f_t& work_estimate) { i_t n = A.n; i_t m = A.m; @@ -173,6 +183,7 @@ i_t find_singletons(const csc_matrix_t& A, std::vector Cdeg(n); std::vector Rp(m + 1); std::vector Rj(nz); + work_estimate += 3 * m + n + nz; i_t max_queue_len = std::max(m, n); std::queue singleton_queue; @@ -186,36 +197,38 @@ i_t find_singletons(const csc_matrix_t& A, Rdeg[A.i[p]]++; } } + work_estimate += 2 * n + 2 * nz; // Add all columns of degree 1 to the queue for (i_t j = n - 1; j >= 0; --j) { if (Cdeg[j] == 1) { singleton_queue.push(j); } } + work_estimate += n + singleton_queue.size(); bool row_form = false; i_t singletons_found = 0; col_singletons = 0; if (!singleton_queue.empty()) { // Don't create the row representation unless we found a singleton - create_row_representation(A, Rp, Rj, workspace); + create_row_representation(A, Rp, Rj, workspace, work_estimate); row_form = true; // Find column singletons - auto& col_perm_vec = static_cast&>(col_perm); - auto& row_perm_vec = static_cast&>(row_perm); row_col_graph_t graph{Cdeg.begin(), - col_perm_vec.begin(), - A.col_start.underlying().cbegin(), - A.i.underlying().cbegin(), + col_perm.begin(), + A.col_start.cbegin(), + A.i.cbegin(), Rdeg.begin(), - row_perm_vec.begin(), + row_perm.begin(), Rp.cbegin(), Rj.cbegin()}; #ifdef SINGLETON_DEBUG printf("Searching for column singletons. Initial size %ld\n", singleton_queue.size()); #endif - col_singletons = order_singletons(singleton_queue, singletons_found, graph); + + col_singletons = order_singletons(singleton_queue, singletons_found, graph, work_estimate); + #ifdef SINGLETON_DEBUG printf("Found %d column singletons\n", col_singletons); #endif @@ -225,30 +238,29 @@ i_t find_singletons(const csc_matrix_t& A, for (i_t i = m - 1; i >= 0; --i) { if (Rdeg[i] == 1) { singleton_queue.push(i); } } + work_estimate += m + singleton_queue.size(); row_singletons = 0; if (!singleton_queue.empty()) { if (!row_form) { // If we haven't created the row representation yet, we need to - create_row_representation(A, Rp, Rj, workspace); // use Rdeg as workspace + create_row_representation(A, Rp, Rj, workspace, work_estimate); // use Rdeg as workspace } // Find row singletons - auto& row_perm_vec2 = static_cast&>(row_perm); - auto& col_perm_vec2 = static_cast&>(col_perm); row_col_graph_t graph{Rdeg.begin(), - row_perm_vec2.begin(), + row_perm.begin(), Rp.cbegin(), Rj.cbegin(), Cdeg.begin(), - col_perm_vec2.begin(), - A.col_start.underlying().cbegin(), - A.i.underlying().cbegin()}; + col_perm.begin(), + A.col_start.cbegin(), + A.i.cbegin()}; #ifdef SINGLETON_DEBUG printf("Searching for row singletons %ld\n", singleton_queue.size()); #endif i_t last = singletons_found; - row_singletons = order_singletons(singleton_queue, singletons_found, graph); + row_singletons = order_singletons(singleton_queue, singletons_found, graph, work_estimate); row_singletons = row_singletons - last; #ifdef SINGLETON_DEBUG printf("Found %d row singletons. %d\n", row_singletons, singletons_found); @@ -263,10 +275,12 @@ i_t find_singletons(const csc_matrix_t& A, printf("Col singletons %d\n", col_singletons); #endif i_t num_empty_cols = complete_permutation(singletons_found, Cdeg, col_perm); + work_estimate += 2 * Cdeg.size(); #ifdef SINGLETON_DEBUG printf("Completed col perm. %d empty cols. Starting row perm\n", num_empty_cols); #endif i_t num_empty_rows = complete_permutation(singletons_found, Rdeg, row_perm); + work_estimate += 2 * Rdeg.size(); #ifdef SINGLETON_DEBUG printf("Empty rows %d Empty columns %d\n", num_empty_rows, num_empty_cols); #endif @@ -277,35 +291,28 @@ i_t find_singletons(const csc_matrix_t& A, template struct row_col_graph_t; -template int order_singletons(std::queue& singleton_queue, - int& singletons_found, - row_col_graph_t& G); +template int order_singletons(std::queue& singleton_queue, + int& singletons_found, + row_col_graph_t& G, + double& work_estimate); // \param [in,out] workspace - size m template void create_row_representation(const csc_matrix_t& A, std::vector& row_start, std::vector& col_index, - std::vector& workspace); + std::vector& workspace, + double& work_estimate); // Complete the permuation -template int complete_permutation>(int singletons, - std::vector& Xdeg, - std::vector& Xperm); -template int complete_permutation>(int singletons, - std::vector& Xdeg, - ins_vector& Xperm); - -template int find_singletons>(const csc_matrix_t& A, - int& row_singletons, - std::vector& row_perm, - int& col_singleton, - std::vector& col_perm); - -template int find_singletons>(const csc_matrix_t& A, - int& row_singletons, - ins_vector& row_perm, - int& col_singleton, - ins_vector& col_perm); - +template int complete_permutation(int singletons, + std::vector& Xdeg, + std::vector& Xperm); + +template int find_singletons(const csc_matrix_t& A, + int& row_singletons, + std::vector& row_perm, + int& col_singleton, + std::vector& col_perm, + double& work_estimate); #endif } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/singletons.hpp b/cpp/src/dual_simplex/singletons.hpp index 2a1ac3e55f..9bbe9270f7 100644 --- a/cpp/src/dual_simplex/singletons.hpp +++ b/cpp/src/dual_simplex/singletons.hpp @@ -27,26 +27,29 @@ struct row_col_graph_t { typename std::vector::const_iterator Yi; }; -template +template i_t order_singletons(std::queue& singleton_queue, i_t& singletons_found, - row_col_graph_t& G); + row_col_graph_t& G, + f_t& work_estimate); // \param [in,out] workspace - size m template void create_row_representationon(const csc_matrix_t& A, std::vector& row_start, std::vector& col_index, - std::vector& workspace); + std::vector& workspace, + f_t& work_estimate); // Complete the permuation -template -i_t complete_permutationn(i_t singletons, std::vector& Xdeg, std::vector& Xperm); +template +i_t complete_permutation(i_t singletons, std::vector& Xdeg, std::vector& Xperm); -template +template i_t find_singletons(const csc_matrix_t& A, i_t& row_singletons, - VectorI& row_perm, + std::vector& row_perm, i_t& col_singleton, - VectorI& col_perm); + std::vector& col_perm, + f_t& work_estimate); } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/sparse_matrix.cpp b/cpp/src/dual_simplex/sparse_matrix.cpp index 6d9cf2b809..ec8aee09c4 100644 --- a/cpp/src/dual_simplex/sparse_matrix.cpp +++ b/cpp/src/dual_simplex/sparse_matrix.cpp @@ -8,12 +8,9 @@ // #include #include #include -#include #include -using cuopt::ins_vector; - // #include // #include @@ -37,8 +34,8 @@ void csc_matrix_t::reallocate(i_t new_nz) this->nz_max = new_nz; } -template -void cumulative_sum(std::vector& inout, OutputVector& output) +template +void cumulative_sum(std::vector& inout, std::vector& output) { i_t n = inout.size(); assert(output.size() == n + 1); @@ -171,6 +168,7 @@ void csc_matrix_t::append_column(const std::vector& x) this->n++; } +// Work = 4*x.i.size() template void csc_matrix_t::append_column(const sparse_vector_t& x) { @@ -193,6 +191,7 @@ void csc_matrix_t::append_column(const sparse_vector_t& x) this->n++; } +// Work is 5*x_nz template void csc_matrix_t::append_column(i_t x_nz, i_t* i, f_t* x) { @@ -212,6 +211,7 @@ void csc_matrix_t::append_column(i_t x_nz, i_t* i, f_t* x) this->n++; } +// Work = 6*nz + 2*n template i_t csc_matrix_t::transpose(csc_matrix_t& AT) const { @@ -658,8 +658,8 @@ i_t csr_matrix_t::check_matrix(std::string matrix_name) const } // x <- x + alpha * A(:, j) -template -void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x) +template +void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, std::vector& x) { const i_t col_start = A.col_start[j]; const i_t col_end = A.col_start[j + 1]; @@ -671,9 +671,13 @@ void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x } // x <- x + alpha * A(:, j) -template -void scatter_dense( - const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x, VectorI& mark, VectorI& indices) +template +void scatter_dense(const csc_matrix_t& A, + i_t j, + f_t alpha, + std::vector& x, + std::vector& mark, + std::vector& indices) { const i_t col_start = A.col_start[j]; const i_t col_end = A.col_start[j + 1]; @@ -933,10 +937,7 @@ template class csc_matrix_t; template class csr_matrix_t; -template void cumulative_sum>(std::vector& inout, - std::vector& output); -template void cumulative_sum>(std::vector& inout, - ins_vector& output); +template void cumulative_sum(std::vector& inout, std::vector& output); template int coo_to_csc(const std::vector& Ai, const std::vector& Aj, @@ -952,31 +953,17 @@ template int scatter(const csc_matrix_t& A, csc_matrix_t& C, int nz); -template void scatter_dense>(const csc_matrix_t& A, - int j, - double alpha, - std::vector& x); - -template void scatter_dense, std::vector>( - const csc_matrix_t& A, - int j, - double alpha, - std::vector& x, - std::vector& mark, - std::vector& indices); - -template void scatter_dense>(const csc_matrix_t& A, - int j, - double alpha, - ins_vector& x); - -template void scatter_dense, ins_vector>( - const csc_matrix_t& A, - int j, - double alpha, - ins_vector& x, - ins_vector& mark, - ins_vector& indices); +template void scatter_dense(const csc_matrix_t& A, + int j, + double alpha, + std::vector& x); + +template void scatter_dense(const csc_matrix_t& A, + int j, + double alpha, + std::vector& x, + std::vector& mark, + std::vector& indices); template int multiply(const csc_matrix_t& A, const csc_matrix_t& B, diff --git a/cpp/src/dual_simplex/sparse_matrix.hpp b/cpp/src/dual_simplex/sparse_matrix.hpp index 8f8f622518..56e3ca82c6 100644 --- a/cpp/src/dual_simplex/sparse_matrix.hpp +++ b/cpp/src/dual_simplex/sparse_matrix.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include @@ -17,9 +16,6 @@ #include #include -// Import instrumented vector -using cuopt::ins_vector; - namespace cuopt::linear_programming::dual_simplex { template @@ -129,12 +125,12 @@ class csc_matrix_t { return true; } - i_t m; // number of rows - i_t n; // number of columns - i_t nz_max; // maximum number of entries - ins_vector col_start; // column pointers (size n + 1) - ins_vector i; // row indices, size nz_max - ins_vector x; // numerical values, size nz_max + i_t m; // number of rows + i_t n; // number of columns + i_t nz_max; // maximum number of entries + std::vector col_start; // column pointers (size n + 1) + std::vector i; // row indices, size nz_max + std::vector x; // numerical values, size nz_max static_assert(std::is_signed_v); // Require signed integers (we make use of this // to avoid extra space / computation) @@ -177,18 +173,18 @@ class csr_matrix_t { return true; } - i_t nz_max; // maximum number of nonzero entries - i_t m; // number of rows - i_t n; // number of cols - ins_vector row_start; // row pointers (size m + 1) - ins_vector j; // column indices, size nz_max - ins_vector x; // numerical values, size nz_max + i_t nz_max; // maximum number of nonzero entries + i_t m; // number of rows + i_t n; // number of cols + std::vector row_start; // row pointers (size m + 1) + std::vector j; // column indices, size nz_max + std::vector x; // numerical values, size nz_max static_assert(std::is_signed_v); }; -template -void cumulative_sum(std::vector& inout, OutputVector& output); +template +void cumulative_sum(std::vector& inout, std::vector& output); template i_t coo_to_csc(const std::vector& Ai, @@ -207,12 +203,16 @@ i_t scatter(const csc_matrix_t& A, i_t nz); // x <- x + alpha * A(:, j) -template -void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x); +template +void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, std::vector& x); -template -void scatter_dense( - const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x, VectorI& mark, VectorI& indices); +template +void scatter_dense(const csc_matrix_t& A, + i_t j, + f_t alpha, + std::vector& x, + std::vector& mark, + std::vector& indices); // Compute C = A*B where C is m x n, A is m x k, and B = k x n // Do this by computing C(:, j) = A*B(:, j) = sum (i=1 to k) A(:, k)*B(i, j) diff --git a/cpp/src/dual_simplex/sparse_vector.cpp b/cpp/src/dual_simplex/sparse_vector.cpp index 1c6a15eb21..0d34d4c390 100644 --- a/cpp/src/dual_simplex/sparse_vector.cpp +++ b/cpp/src/dual_simplex/sparse_vector.cpp @@ -13,8 +13,6 @@ namespace cuopt::linear_programming::dual_simplex { -using cuopt::ins_vector; - template sparse_vector_t::sparse_vector_t(const csc_matrix_t& A, i_t col) { @@ -70,17 +68,12 @@ void sparse_vector_t::from_dense(const std::vector& in) n = in.size(); i.reserve(n); x.reserve(n); - auto& i_vec = i.underlying(); - auto& x_vec = x.underlying(); for (i_t k = 0; k < n; ++k) { if (in[k] != 0) { - i_vec.push_back(k); - x_vec.push_back(in[k]); + i.push_back(k); + x.push_back(in[k]); } } - const size_t nz = i_vec.size(); - i.byte_stores += nz * sizeof(i_t); - x.byte_stores += nz * sizeof(f_t); } template @@ -92,8 +85,8 @@ void sparse_vector_t::to_csc(csc_matrix_t& A) const A.col_start.resize(2); A.col_start[0] = 0; A.col_start[1] = i.size(); - A.i = i.underlying(); - A.x = x.underlying(); + A.i = i; + A.x = x; } template @@ -107,17 +100,6 @@ void sparse_vector_t::to_dense(std::vector& x_dense) const } } -template -void sparse_vector_t::to_dense(ins_vector& x_dense) const -{ - x_dense.clear(); - x_dense.resize(n, 0.0); - const i_t nz = i.size(); - for (i_t k = 0; k < nz; ++k) { - x_dense[i[k]] = x[k]; - } -} - template void sparse_vector_t::scatter(std::vector& x_dense) const { @@ -128,29 +110,12 @@ void sparse_vector_t::scatter(std::vector& x_dense) const } } -template -void sparse_vector_t::scatter(ins_vector& x_dense) const -{ - // Assumes x_dense is already cleared - auto& sv_i = i.underlying(); - auto& sv_x = x.underlying(); - auto& dense = x_dense.underlying(); - const i_t nz = sv_i.size(); - for (i_t k = 0; k < nz; ++k) { - dense[sv_i[k]] += sv_x[k]; - } - i.byte_loads += nz * sizeof(i_t); - x.byte_loads += nz * sizeof(f_t); - x_dense.byte_loads += nz * sizeof(f_t); - x_dense.byte_stores += nz * sizeof(f_t); -} - template void sparse_vector_t::inverse_permute_vector(const std::vector& p) { assert(p.size() == n); i_t nz = i.size(); - ins_vector i_perm(nz); + std::vector i_perm(nz); for (i_t k = 0; k < nz; ++k) { i_perm[k] = p[i[k]]; } @@ -166,7 +131,7 @@ void sparse_vector_t::inverse_permute_vector(const std::vector& p i_t nz = i.size(); y.n = n; y.x = x; - ins_vector i_perm(nz); + std::vector i_perm(nz); for (i_t k = 0; k < nz; ++k) { i_perm[k] = p[i[k]]; } @@ -232,8 +197,8 @@ void sparse_vector_t::sort() } else { // Use a n log n sort const i_t nz = i.size(); - ins_vector i_sorted(nz); - ins_vector x_sorted(nz); + std::vector i_sorted(nz); + std::vector x_sorted(nz); std::vector perm(nz); for (i_t k = 0; k < nz; ++k) { perm[k] = k; diff --git a/cpp/src/dual_simplex/sparse_vector.hpp b/cpp/src/dual_simplex/sparse_vector.hpp index fadd2df338..6ea642ee07 100644 --- a/cpp/src/dual_simplex/sparse_vector.hpp +++ b/cpp/src/dual_simplex/sparse_vector.hpp @@ -9,15 +9,11 @@ #include #include -#include #include namespace cuopt::linear_programming::dual_simplex { -// Import instrumented vector type -using cuopt::ins_vector; - // A sparse vector stored as a list of nonzero coefficients and their indices template class sparse_vector_t { @@ -37,13 +33,9 @@ class sparse_vector_t { void to_csc(csc_matrix_t& A) const; // convert a sparse vector into a dense vector. Dense vector is cleared and resized. void to_dense(std::vector& x_dense) const; - // convert a sparse vector into an instrumented dense vector. - void to_dense(ins_vector& x_dense) const; // scatter a sparse vector into a dense vector. Assumes x_dense is already cleared or // preinitialized void scatter(std::vector& x_dense) const; - // scatter into instrumented vector - void scatter(ins_vector& x_dense) const; // inverse permute the current sparse vector void inverse_permute_vector(const std::vector& p); // inverse permute a sparse vector into another sparse vector @@ -71,8 +63,8 @@ class sparse_vector_t { void squeeze(sparse_vector_t& y) const; i_t n; - ins_vector i; - ins_vector x; + std::vector i; + std::vector x; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/triangle_solve.cpp b/cpp/src/dual_simplex/triangle_solve.cpp index 2f280295bf..ad09ae30cf 100644 --- a/cpp/src/dual_simplex/triangle_solve.cpp +++ b/cpp/src/dual_simplex/triangle_solve.cpp @@ -26,19 +26,22 @@ template i_t reach(const sparse_vector_t& b, const std::optional>& pinv, csc_matrix_t& G, - std::vector& xi) + std::vector& xi, + f_t& work_estimate) { const i_t m = G.m; i_t top = m; const i_t bnz = b.i.size(); for (i_t p = 0; p < bnz; ++p) { if (!MARKED(G.col_start, b.i[p])) { // start a DFS at unmarked node i - top = depth_first_search(b.i[p], pinv, G, top, xi, xi.begin() + m); + top = depth_first_search(b.i[p], pinv, G, top, xi, xi.begin() + m, work_estimate); } } + work_estimate += 4 * bnz; for (i_t p = top; p < m; ++p) { // restore G MARK(G.col_start, xi[p]); } + work_estimate += 3 * (m - top); return top; } @@ -61,7 +64,8 @@ i_t depth_first_search(i_t j, csc_matrix_t& G, i_t top, std::vector& xi, - typename std::vector::iterator pstack) + typename std::vector::iterator pstack, + f_t& work_estimate) { i_t head = 0; xi[0] = j; // Initialize the recursion stack @@ -75,10 +79,12 @@ i_t depth_first_search(i_t j, // Point to the first outgoing edge of node j pstack[head] = (jnew < 0) ? 0 : UNFLIP(G.col_start[jnew]); } - done = 1; // Node j is done if no unvisited neighbors - i_t p2 = (jnew < 0) ? 0 : UNFLIP(G.col_start[jnew + 1]); - for (i_t p = pstack[head]; p < p2; ++p) { // Examine all neighbors of j - i_t i = G.i[p]; // Consider neighbor i + done = 1; // Node j is done if no unvisited neighbors + i_t p2 = (jnew < 0) ? 0 : UNFLIP(G.col_start[jnew + 1]); + const i_t psav = pstack[head]; + i_t p; + for (p = psav; p < p2; ++p) { // Examine all neighbors of j + i_t i = G.i[p]; // Consider neighbor i if (MARKED(G.col_start, i)) { continue; // skip visited node i } @@ -87,6 +93,7 @@ i_t depth_first_search(i_t j, done = 0; // node j is not done break; // break to start dfs at node i } + work_estimate += 3 * (p - psav) + 10; if (done) { pstack[head] = 0; // restore pstack so it can be used again in other routines xi[head] = 0; // restore xi so it can be used again in other routines @@ -102,18 +109,23 @@ i_t sparse_triangle_solve(const sparse_vector_t& b, const std::optional>& pinv, std::vector& xi, csc_matrix_t& G, - f_t* x) + f_t* x, + f_t& work_estimate) { i_t m = G.m; assert(b.n == m); - i_t top = reach(b, pinv, G, xi); + i_t top = reach(b, pinv, G, xi, work_estimate); for (i_t p = top; p < m; ++p) { x[xi[p]] = 0; // Clear x vector } + work_estimate += 2 * (m - top); + const i_t bnz = b.i.size(); for (i_t p = 0; p < bnz; ++p) { x[b.i[p]] = b.x[p]; // Scatter b } + work_estimate += 3 * bnz; + for (i_t px = top; px < m; ++px) { i_t j = xi[px]; // x(j) is nonzero i_t J = pinv ? ((*pinv)[j]) : j; // j maps to column J of G @@ -131,6 +143,7 @@ i_t sparse_triangle_solve(const sparse_vector_t& b, end = G.col_start[J + 1] - 1; } x[j] /= Gjj; + work_estimate += 4 * (end - p) + 7; for (; p < end; ++p) { x[G.i[p]] -= G.x[p] * x[j]; // x(i) -= G(i,j) * x(j) } @@ -147,26 +160,30 @@ i_t sparse_triangle_solve(const sparse_vector_t& b, template int reach(const sparse_vector_t& b, const std::optional>& pinv, csc_matrix_t& G, - std::vector& xi); + std::vector& xi, + double& work_estimate); template int depth_first_search(int j, const std::optional>& pinv, csc_matrix_t& G, int top, std::vector& xi, - std::vector::iterator pstack); + std::vector::iterator pstack, + double& work_estimate); template int sparse_triangle_solve(const sparse_vector_t& b, const std::optional>& pinv, std::vector& xi, csc_matrix_t& G, - double* x); + double* x, + double& work_estimate); template int sparse_triangle_solve(const sparse_vector_t& b, const std::optional>& pinv, std::vector& xi, csc_matrix_t& G, - double* x); + double* x, + double& work_estimate); #endif } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/triangle_solve.hpp b/cpp/src/dual_simplex/triangle_solve.hpp index 20af92cc93..811ed8b927 100644 --- a/cpp/src/dual_simplex/triangle_solve.hpp +++ b/cpp/src/dual_simplex/triangle_solve.hpp @@ -25,126 +25,91 @@ namespace cuopt::linear_programming::dual_simplex { // Solve L*x = b. On input x contains the right-hand side b, on output the // solution -template -i_t lower_triangular_solve(const csc_matrix_t& L, VectorF& x) +template +i_t lower_triangular_solve(const csc_matrix_t& L, std::vector& x, f_t& work_estimate) { i_t n = L.n; assert(x.size() == n); - auto& L_cs = L.col_start.underlying(); - auto& L_i = L.i.underlying(); - auto& L_x = L.x.underlying(); - - size_t nnz_processed = 0; for (i_t j = 0; j < n; ++j) { - i_t col_start = L_cs[j]; - i_t col_end = L_cs[j + 1]; + i_t col_start = L.col_start[j]; + i_t col_end = L.col_start[j + 1]; if (x[j] != 0.0) { - x[j] /= L_x[col_start]; - auto x_j = x[j]; // hoist this load out of the loop + x[j] /= L.x[col_start]; + const f_t x_j = x[j]; // hoist this load out of the loop // as the compiler cannot guess that x[j] never aliases to x[L.i[p]] for (i_t p = col_start + 1; p < col_end; ++p) { - x[L_i[p]] -= L_x[p] * x_j; + x[L.i[p]] -= L.x[p] * x_j; } - nnz_processed += col_end - col_start; } } - - L.col_start.byte_loads += (n + 1) * sizeof(i_t); - L.i.byte_loads += nnz_processed * sizeof(i_t); - L.x.byte_loads += nnz_processed * sizeof(f_t); - + work_estimate += 3 * n + 3 * L.col_start[n]; return 0; } // Solve L'*x = b. On input x contains the right-hand side b, on output the // solution -template -i_t lower_triangular_transpose_solve(const csc_matrix_t& L, VectorF& x) +template +i_t lower_triangular_transpose_solve(const csc_matrix_t& L, + std::vector& x, + f_t& work_estimate) { const i_t n = L.n; assert(x.size() == n); - auto& L_cs = L.col_start.underlying(); - auto& L_i = L.i.underlying(); - auto& L_x = L.x.underlying(); - for (i_t j = n - 1; j >= 0; --j) { - const i_t col_start = L_cs[j] + 1; - const i_t col_end = L_cs[j + 1]; + const i_t col_start = L.col_start[j] + 1; + const i_t col_end = L.col_start[j + 1]; for (i_t p = col_start; p < col_end; ++p) { - x[j] -= L_x[p] * x[L_i[p]]; + x[j] -= L.x[p] * x[L.i[p]]; } - x[j] /= L_x[L_cs[j]]; + x[j] /= L.x[L.col_start[j]]; } - - const size_t total_nnz = L_cs[n]; - L.col_start.byte_loads += (n + 1) * sizeof(i_t); - L.i.byte_loads += total_nnz * sizeof(i_t); - L.x.byte_loads += total_nnz * sizeof(f_t); - + work_estimate += 6 * n + 4 * L.col_start[n]; return 0; } // Solve U*x = b. On input x contains the right-hand side b, on output the // solution -template -i_t upper_triangular_solve(const csc_matrix_t& U, VectorF& x) +template +i_t upper_triangular_solve(const csc_matrix_t& U, std::vector& x, f_t& work_estimate) { const i_t n = U.n; assert(x.size() == n); - - auto& U_cs = U.col_start.underlying(); - auto& U_i = U.i.underlying(); - auto& U_x = U.x.underlying(); - - size_t nnz_processed = 0; for (i_t j = n - 1; j >= 0; --j) { - const i_t col_start = U_cs[j]; - const i_t col_end = U_cs[j + 1] - 1; + const i_t col_start = U.col_start[j]; + const i_t col_end = U.col_start[j + 1] - 1; if (x[j] != 0.0) { - x[j] /= U_x[col_end]; - auto x_j = x[j]; // same x_j load hoisting + x[j] /= U.x[col_end]; + const f_t x_j = x[j]; // same x_j load hoisting for (i_t p = col_start; p < col_end; ++p) { - x[U_i[p]] -= U_x[p] * x_j; + x[U.i[p]] -= U.x[p] * x_j; } - nnz_processed += col_end - col_start; + work_estimate += 3 * (col_end - col_start) + 3; } } - - U.col_start.byte_loads += (n + 1) * sizeof(i_t); - U.i.byte_loads += nnz_processed * sizeof(i_t); - U.x.byte_loads += nnz_processed * sizeof(f_t); - + work_estimate += 3 * n; return 0; } // Solve U'*x = b. On input x contains the right-hand side b, on output the // solution -template -i_t upper_triangular_transpose_solve(const csc_matrix_t& U, VectorF& x) +template +i_t upper_triangular_transpose_solve(const csc_matrix_t& U, + std::vector& x, + f_t& work_estimate) { const i_t n = U.n; assert(x.size() == n); - - auto& U_cs = U.col_start.underlying(); - auto& U_i = U.i.underlying(); - auto& U_x = U.x.underlying(); - for (i_t j = 0; j < n; ++j) { - const i_t col_start = U_cs[j]; - const i_t col_end = U_cs[j + 1] - 1; + const i_t col_start = U.col_start[j]; + const i_t col_end = U.col_start[j + 1] - 1; for (i_t p = col_start; p < col_end; ++p) { - x[j] -= U_x[p] * x[U_i[p]]; + x[j] -= U.x[p] * x[U.i[p]]; } - x[j] /= U_x[col_end]; + x[j] /= U.x[col_end]; } - - const size_t total_nnz = U_cs[n]; - U.col_start.byte_loads += (n + 1) * sizeof(i_t); - U.i.byte_loads += total_nnz * sizeof(i_t); - U.x.byte_loads += total_nnz * sizeof(f_t); - + work_estimate += 4 * n + 4 * U.col_start[n]; return 0; } @@ -159,7 +124,8 @@ template i_t reach(const sparse_vector_t& b, const std::optional>& pinv, csc_matrix_t& G, - std::vector& xi); + std::vector& xi, + f_t& work_estimate); // \brief Performs a depth-first search starting from node j in the graph // defined by G @@ -182,7 +148,8 @@ i_t depth_first_search(i_t j, csc_matrix_t& G, i_t top, std::vector& xi, - typename std::vector::iterator pstack); + typename std::vector::iterator pstack, + f_t& work_estimate); // \brief sparse_triangle_solve solve L*x = b or U*x = b where L is a sparse lower // triangular matrix @@ -201,6 +168,7 @@ i_t sparse_triangle_solve(const sparse_vector_t& b, const std::optional>& pinv, std::vector& xi, csc_matrix_t& G, - f_t* x); + f_t* x, + f_t& work_estimate); } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/vector_math.cpp b/cpp/src/dual_simplex/vector_math.cpp index 8ea31d3e39..0db09b00b6 100644 --- a/cpp/src/dual_simplex/vector_math.cpp +++ b/cpp/src/dual_simplex/vector_math.cpp @@ -57,6 +57,7 @@ f_t dot(const std::vector& x, const std::vector& y) return dot; } +// Work = 3*min(nz_x, nz_y) template f_t sparse_dot( i_t const* xind, f_t const* xval, i_t nx, i_t const* yind, i_t ny, f_t const* y_scatter_val) @@ -123,8 +124,48 @@ f_t sparse_dot(const std::vector& xind, return dot; } -// NOTE: permute_vector, inverse_permute_vector, and inverse_permutation are now -// templated on vector types and defined in the header file. +// Computes x = P*b or x=b(p) in MATLAB. +// Work is 3*n +template +i_t permute_vector(const std::vector& p, const std::vector& b, std::vector& x) +{ + i_t n = p.size(); + assert(x.size() == n); + assert(b.size() == n); + for (i_t k = 0; k < n; ++k) { + x[k] = b[p[k]]; + } + return 0; +} + +// Computes x = P'*b or x(p) = b in MATLAB. +// Work is 3 * n +template +i_t inverse_permute_vector(const std::vector& p, + const std::vector& b, + std::vector& x) +{ + i_t n = p.size(); + assert(x.size() == n); + assert(b.size() == n); + for (i_t k = 0; k < n; ++k) { + x[p[k]] = b[k]; + } + return 0; +} + +// Computes pinv from p. Or pinv(p) = 1:n in MATLAB +// Work is 2*n +template +i_t inverse_permutation(const std::vector& p, std::vector& pinv) +{ + i_t n = p.size(); + if (pinv.size() != n) { pinv.resize(n); } + for (i_t k = 0; k < n; ++k) { + pinv[p[k]] = k; + } + return 0; +} #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE @@ -160,8 +201,14 @@ template double sparse_dot(int const* xind, template double sparse_dot( int* xind, double* xval, int nx, int* yind, double* yval, int ny); -// NOTE: permute_vector, inverse_permute_vector, and inverse_permutation are now -// templated on vector types and defined in the header file, so no explicit instantiation needed. +template int permute_vector(const std::vector& p, + const std::vector& b, + std::vector& x); +template int inverse_permute_vector(const std::vector& p, + const std::vector& b, + std::vector& x); +template int inverse_permutation(const std::vector& p, std::vector& pinv); + #endif } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/vector_math.hpp b/cpp/src/dual_simplex/vector_math.hpp index 6bf573e892..478e024e51 100644 --- a/cpp/src/dual_simplex/vector_math.hpp +++ b/cpp/src/dual_simplex/vector_math.hpp @@ -57,41 +57,16 @@ template f_t sparse_dot(i_t* xind, f_t* xval, i_t nx, i_t* yind, f_t* yval, i_t ny); // Computes x = P*b or x=b(p) in MATLAB. -template -int permute_vector(const VectorI& p, const VectorF_in& b, VectorF_out& x) -{ - auto n = p.size(); - assert(x.size() == n); - assert(b.size() == n); - for (decltype(n) k = 0; k < n; ++k) { - x[k] = b[p[k]]; - } - return 0; -} - +template +i_t permute_vector(const std::vector& p, const std::vector& b, std::vector& x); // Computes x = P'*b or x(p) = b in MATLAB. -template -int inverse_permute_vector(const VectorI& p, const VectorF_in& b, VectorF_out& x) -{ - auto n = p.size(); - assert(x.size() == n); - assert(b.size() == n); - for (decltype(n) k = 0; k < n; ++k) { - x[p[k]] = b[k]; - } - return 0; -} +template +i_t inverse_permute_vector(const std::vector& p, + const std::vector& b, + std::vector& x); // Computes pinv from p. Or pinv(p) = 1:n in MATLAB -template -int inverse_permutation(const VectorI_in& p, VectorI_out& pinv) -{ - auto n = p.size(); - if (pinv.size() != n) { pinv.resize(n); } - for (decltype(n) k = 0; k < n; ++k) { - pinv[p[k]] = k; - } - return 0; -} +template +i_t inverse_permutation(const std::vector& p, std::vector& pinv); } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu index df7f37c541..d77e2e5f65 100644 --- a/cpp/src/mip_heuristics/problem/problem.cu +++ b/cpp/src/mip_heuristics/problem/problem.cu @@ -2090,9 +2090,9 @@ void problem_t::get_host_user_problem( user_problem.objective = cuopt::host_copy(objective_coefficients, stream); dual_simplex::csr_matrix_t csr_A(m, n, nz); - csr_A.x = ins_vector(cuopt::host_copy(coefficients, stream)); - csr_A.j = ins_vector(cuopt::host_copy(variables, stream)); - csr_A.row_start = ins_vector(cuopt::host_copy(offsets, stream)); + csr_A.x = std::vector(cuopt::host_copy(coefficients, stream)); + csr_A.j = std::vector(cuopt::host_copy(variables, stream)); + csr_A.row_start = std::vector(cuopt::host_copy(offsets, stream)); csr_A.to_compressed_col(user_problem.A); diff --git a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp index 6ecb37c6e6..aebe87b140 100644 --- a/cpp/src/pdlp/translate.hpp +++ b/cpp/src/pdlp/translate.hpp @@ -30,9 +30,9 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( user_problem.objective = cuopt::host_copy(model.objective_coefficients, handle_ptr->get_stream()); dual_simplex::csr_matrix_t csr_A(m, n, nz); - csr_A.x = ins_vector(cuopt::host_copy(model.coefficients, handle_ptr->get_stream())); - csr_A.j = ins_vector(cuopt::host_copy(model.variables, handle_ptr->get_stream())); - csr_A.row_start = ins_vector(cuopt::host_copy(model.offsets, handle_ptr->get_stream())); + csr_A.x = std::vector(cuopt::host_copy(model.coefficients, handle_ptr->get_stream())); + csr_A.j = std::vector(cuopt::host_copy(model.variables, handle_ptr->get_stream())); + csr_A.row_start = std::vector(cuopt::host_copy(model.offsets, handle_ptr->get_stream())); csr_A.to_compressed_col(user_problem.A); @@ -121,9 +121,9 @@ void translate_to_crossover_problem(const detail::problem_t& problem, dual_simplex::csr_matrix_t csr_A( problem.n_constraints, problem.n_variables, problem.nnz); - csr_A.x = ins_vector(cuopt::host_copy(problem.coefficients, stream)); - csr_A.j = ins_vector(cuopt::host_copy(problem.variables, stream)); - csr_A.row_start = ins_vector(cuopt::host_copy(problem.offsets, stream)); + csr_A.x = std::vector(cuopt::host_copy(problem.coefficients, stream)); + csr_A.j = std::vector(cuopt::host_copy(problem.variables, stream)); + csr_A.row_start = std::vector(cuopt::host_copy(problem.offsets, stream)); stream.synchronize(); CUOPT_LOG_DEBUG("Converting to compressed column");