From a8ebd6a47d482642ca58dfd1905077bef58af0e8 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Tue, 26 Aug 2025 16:17:12 +0000 Subject: [PATCH 1/3] fix allocations in related_var_count --- .../linear_programming/cuopt/run_mip.cpp | 38 ++++++++++++++++--- cpp/src/mip/problem/problem.cu | 12 ++++-- cpp/src/mip/solve.cu | 11 +++--- cpp/src/utilities/cuda_helpers.cuh | 25 ++++++++++++ 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index 713d55f16b..58a7799084 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -28,7 +28,10 @@ #include #include +#include +#include #include +#include #include @@ -256,7 +259,9 @@ void run_single_file_mp(std::string file_path, { std::cout << "running file " << file_path << " on gpu : " << device << std::endl; auto memory_resource = make_async(); - rmm::mr::set_current_device_resource(memory_resource.get()); + auto limiting_adaptor = + rmm::mr::limiting_resource_adaptor(memory_resource.get(), 6ULL * 1024ULL * 1024ULL * 1024ULL); /* hard-coded 6 GiB allocation limit */ + rmm::mr::set_current_device_resource(&limiting_adaptor); int sol_found = run_single_file(file_path, device, batch_id, @@ -340,6 +345,15 @@ int main(int argc, char* argv[]) .scan<'g', double>() .default_value(std::numeric_limits::max()); + program.add_argument("--memory-limit") + .help("memory limit in MB") + .scan<'g', double>() + .default_value(0); + + program.add_argument("--track-allocations") + .help("track allocations (t/f)") + .default_value(std::string("f")); + // Parse arguments try { program.parse_args(argc, argv); @@ -362,10 +376,12 @@ int main(int argc, char* argv[]) std::string result_file; int batch_num = -1; - bool heuristics_only = program.get("--heuristics-only")[0] == 't'; - int num_cpu_threads = program.get("--num-cpu-threads"); - bool write_log_file = program.get("--write-log-file")[0] == 't'; - bool log_to_console 
= program.get("--log-to-console")[0] == 't'; + bool heuristics_only = program.get("--heuristics-only")[0] == 't'; + int num_cpu_threads = program.get("--num-cpu-threads"); + bool write_log_file = program.get("--write-log-file")[0] == 't'; + bool log_to_console = program.get("--log-to-console")[0] == 't'; + double memory_limit = program.get("--memory-limit"); + bool track_allocations = program.get("--track-allocations")[0] == 't'; if (program.is_used("--out-dir")) { out_dir = program.get("--out-dir"); @@ -469,7 +485,17 @@ int main(int argc, char* argv[]) merge_result_files(out_dir, result_file, n_gpus, batch_num); } else { auto memory_resource = make_async(); - rmm::mr::set_current_device_resource(memory_resource.get()); + if (memory_limit > 0) { + auto limiting_adaptor = + rmm::mr::limiting_resource_adaptor(memory_resource.get(), memory_limit * 1024ULL * 1024ULL); /* memory_limit (a double) scaled by 1024*1024 to bytes */ + rmm::mr::set_current_device_resource(&limiting_adaptor); /* NOTE(review): limiting_adaptor is local to this if-block, so the pointer registered here is left dangling once the block ends, before run_single_file() is called; confirm and hoist the adaptor */ + } else if (track_allocations) { + rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource.get(), + /*capture_stacks=*/true); + rmm::mr::set_current_device_resource(&tracking_adaptor); /* NOTE(review): same block-scope lifetime concern as limiting_adaptor; this branch is only reached when memory_limit is not greater than 0 */ + } else { + rmm::mr::set_current_device_resource(memory_resource.get()); + } run_single_file(path, 0, 0, diff --git a/cpp/src/mip/problem/problem.cu b/cpp/src/mip/problem/problem.cu index 1a5f76b038..fa1312a8ec 100644 --- a/cpp/src/mip/problem/problem.cu +++ b/cpp/src/mip/problem/problem.cu @@ -21,6 +21,7 @@ #include "problem_kernels.cuh" #include +#include #include #include @@ -810,16 +811,21 @@ void problem_t::compute_related_variables(double time_limit) handle_ptr->sync_stream(); + // previously used constants were based on 40GB of memory. 
Scale accordingly on smaller GPUs + // We can't rely on querying free memory or allocation try/catch + // since this would break determinism guarantees (GPU may be shared by other processes) + f_t size_factor = std::min(1.0, cuopt::get_device_memory_size() / 1e9 / 40.0); /* fraction of a 40e9-byte device, capped at 1.0 */ + // TODO: determine optimal number of slices based on available GPU memory? This used to be 2e9 / // n_variables - i_t max_slice_size = 6e8 / n_variables; + i_t max_slice_size = 6e8 * size_factor / n_variables; rmm::device_uvector varmap(max_slice_size * n_variables, handle_ptr->get_stream()); rmm::device_uvector offsets(max_slice_size * n_variables, handle_ptr->get_stream()); related_variables.resize(0, handle_ptr->get_stream()); // TODO: this used to be 1e8 - related_variables.reserve(1e8, handle_ptr->get_stream()); // reserve space + related_variables.reserve(1e8 * size_factor, handle_ptr->get_stream()); // reserve space related_variables_offsets.resize(n_variables + 1, handle_ptr->get_stream()); related_variables_offsets.set_element_to_zero_async(0, handle_ptr->get_stream()); @@ -863,7 +869,7 @@ void problem_t::compute_related_variables(double time_limit) auto current_time = std::chrono::high_resolution_clock::now(); // if the related variable array would wind up being too large for available memory, abort // TODO this used to be 1e9 - if (related_variables.size() > 1e9 || + if (related_variables.size() > 1e9 * size_factor || std::chrono::duration_cast(current_time - start_time).count() > time_limit) { CUOPT_LOG_DEBUG( diff --git a/cpp/src/mip/solve.cu b/cpp/src/mip/solve.cu index de84e2c236..e0cb65cdeb 100644 --- a/cpp/src/mip/solve.cu +++ b/cpp/src/mip/solve.cu @@ -267,12 +267,13 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } catch (const cuopt::logic_error& e) { CUOPT_LOG_ERROR("Error in solve_mip: %s", e.what()); return mip_solution_t{e, op_problem.get_handle_ptr()->get_stream()}; - } catch (const std::bad_alloc& e) { - CUOPT_LOG_ERROR("Error in solve_mip: %s", 
e.what()); - return mip_solution_t{ - cuopt::logic_error("Memory allocation failed", cuopt::error_type_t::RuntimeError), - op_problem.get_handle_ptr()->get_stream()}; } + // catch (const std::bad_alloc& e) { + // CUOPT_LOG_ERROR("Error in solve_mip: %s", e.what()); + // return mip_solution_t{ + // cuopt::logic_error("Memory allocation failed", cuopt::error_type_t::RuntimeError), + // op_problem.get_handle_ptr()->get_stream()}; + // } } template diff --git a/cpp/src/utilities/cuda_helpers.cuh b/cpp/src/utilities/cuda_helpers.cuh index 3de8206993..d70eb2d525 100644 --- a/cpp/src/utilities/cuda_helpers.cuh +++ b/cpp/src/utilities/cuda_helpers.cuh @@ -24,6 +24,8 @@ #include #include #include +#include +#include namespace cuopt { @@ -208,4 +210,27 @@ DI void sorted_insert(T* array, T item, int curr_size, int max_size) array[0] = item; } +inline size_t get_device_memory_size() +{ + // Reports the memory budget: total device memory, capped by the allocation limit when the current RMM resource is a limiting adaptor (free_mem is queried but not used) + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); /* NOTE(review): the returned cudaError_t is ignored; total_mem is left uninitialized if the call fails */ + + auto res = rmm::mr::get_current_device_resource(); + auto limiting_adaptor = + dynamic_cast*>(res); + // Did we specify an explicit memory limit? 
(the cast is non-null only when the current resource is a limiting adaptor) 
+ if (limiting_adaptor) { + printf("limiting_adaptor->get_allocation_limit(): %fMiB\n", + limiting_adaptor->get_allocation_limit() / (double)1e6); /* NOTE(review): debug output printed whenever a limit is set; values are divided by 1e6 although the labels say MiB */ + printf("used_mem: %fMiB\n", limiting_adaptor->get_allocated_bytes() / (double)1e6); + printf("free_mem: %fMiB\n", + (limiting_adaptor->get_allocation_limit() - limiting_adaptor->get_allocated_bytes()) / + (double)1e6); + return std::min(total_mem, limiting_adaptor->get_allocation_limit()); /* the smaller of device total and configured limit */ + } else { + return total_mem; + } +} + } // namespace cuopt From 94d4f967117ed0cf95fecff3f2fa5b2867e858a8 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Tue, 26 Aug 2025 16:23:45 +0000 Subject: [PATCH 2/3] remove leftover --- cpp/src/mip/solve.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/mip/solve.cu b/cpp/src/mip/solve.cu index e0cb65cdeb..de84e2c236 100644 --- a/cpp/src/mip/solve.cu +++ b/cpp/src/mip/solve.cu @@ -267,13 +267,12 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } catch (const cuopt::logic_error& e) { CUOPT_LOG_ERROR("Error in solve_mip: %s", e.what()); return mip_solution_t{e, op_problem.get_handle_ptr()->get_stream()}; + } catch (const std::bad_alloc& e) { + CUOPT_LOG_ERROR("Error in solve_mip: %s", e.what()); + return mip_solution_t{ + cuopt::logic_error("Memory allocation failed", cuopt::error_type_t::RuntimeError), + op_problem.get_handle_ptr()->get_stream()}; } - // catch (const std::bad_alloc& e) { - // CUOPT_LOG_ERROR("Error in solve_mip: %s", e.what()); - // return mip_solution_t{ - // cuopt::logic_error("Memory allocation failed", cuopt::error_type_t::RuntimeError), - // op_problem.get_handle_ptr()->get_stream()}; - // } } template From 1d2a04da4ea4233a62e18b10f37a9f97c49ca79a Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Wed, 27 Aug 2025 07:44:29 +0000 Subject: [PATCH 3/3] fix argparse bad_any_cast --- benchmarks/linear_programming/cuopt/run_mip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index 58a7799084..c0a3cd113e 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -348,7 +348,7 @@ int main(int argc, char* argv[]) program.add_argument("--memory-limit") .help("memory limit in MB") .scan<'g', double>() - .default_value(0); + .default_value(0.0); /* double default: memory_limit is read back as a double, and the subject line says the int default caused a bad_any_cast */ program.add_argument("--track-allocations") .help("track allocations (t/f)")