From 32c057e1c6cd2626a672b06054400e90f8a6e3d7 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 25 Jul 2025 08:48:59 +0000 Subject: [PATCH 01/11] Fix assert failures on MIP problems losing their integer variables after presolve --- .../mip/diversity/recombiners/recombiner.cuh | 14 +++++++------ cpp/src/mip/presolve/lb_probing_cache.cu | 18 +++++++++-------- cpp/src/mip/presolve/probing_cache.cu | 20 ++++++++++--------- cpp/src/mip/problem/problem.cu | 4 ++++ 4 files changed, 33 insertions(+), 23 deletions(-) diff --git a/cpp/src/mip/diversity/recombiners/recombiner.cuh b/cpp/src/mip/diversity/recombiners/recombiner.cuh index 94ca34ea18..2ef67c42a6 100644 --- a/cpp/src/mip/diversity/recombiners/recombiner.cuh +++ b/cpp/src/mip/diversity/recombiners/recombiner.cuh @@ -92,12 +92,14 @@ class recombiner_t { reset(a.problem_ptr->n_integer_vars, a.handle_ptr); const i_t TPB = 128; i_t n_blocks = (a.problem_ptr->n_integer_vars + TPB - 1) / TPB; - assign_same_variables_kernel - <<get_stream()>>>(a.view(), - b.view(), - offspring.view(), - cuopt::make_span(remaining_indices), - n_remaining.data()); + if (a.problem_ptr->n_integer_vars > 0) { + assign_same_variables_kernel + <<get_stream()>>>(a.view(), + b.view(), + offspring.view(), + cuopt::make_span(remaining_indices), + n_remaining.data()); + } i_t remaining_variables = this->n_remaining.value(a.handle_ptr->get_stream()); auto vec_remaining_indices = diff --git a/cpp/src/mip/presolve/lb_probing_cache.cu b/cpp/src/mip/presolve/lb_probing_cache.cu index 598a4c6bce..6af3a8fd0e 100644 --- a/cpp/src/mip/presolve/lb_probing_cache.cu +++ b/cpp/src/mip/presolve/lb_probing_cache.cu @@ -286,14 +286,16 @@ inline std::vector compute_prioritized_integer_indices( cuopt_assert(res, "The activity computation must be feasible during probing cache!"); CUOPT_LOG_INFO("prioritized integer_indices n_integer_vars %d", problem.pb->n_integer_vars); // compute the min var slack - compute_min_slack_per_var - <<n_integer_vars, 128, 0, 
problem.handle_ptr->get_stream()>>>( - problem.pb->view(), - make_span_2(bound_presolve.cnst_slack), - make_span(min_slack_per_var), - make_span(different_coefficient), - make_span(max_excess_per_var), - make_span(max_n_violated_per_constraint)); + if (problem.pb->n_integer_vars > 0) { + compute_min_slack_per_var + <<n_integer_vars, 128, 0, problem.handle_ptr->get_stream()>>>( + problem.pb->view(), + make_span_2(bound_presolve.cnst_slack), + make_span(min_slack_per_var), + make_span(different_coefficient), + make_span(max_excess_per_var), + make_span(max_n_violated_per_constraint)); + } auto iterator = thrust::make_zip_iterator(thrust::make_tuple( max_n_violated_per_constraint.begin(), max_excess_per_var.begin(), min_slack_per_var.begin())); // sort the vars diff --git a/cpp/src/mip/presolve/probing_cache.cu b/cpp/src/mip/presolve/probing_cache.cu index 36140c5b31..e1ea6a92bd 100644 --- a/cpp/src/mip/presolve/probing_cache.cu +++ b/cpp/src/mip/presolve/probing_cache.cu @@ -328,15 +328,17 @@ inline std::vector compute_prioritized_integer_indices( cuopt_assert(res, "The activity computation must be feasible during probing cache!"); CUOPT_LOG_DEBUG("prioritized integer_indices n_integer_vars %d", problem.n_integer_vars); // compute the min var slack - compute_min_slack_per_var - <<get_stream()>>>( - problem.view(), - make_span(bound_presolve.upd.min_activity), - make_span(bound_presolve.upd.max_activity), - make_span(min_slack_per_var), - make_span(different_coefficient), - make_span(max_excess_per_var), - make_span(max_n_violated_per_constraint)); + if (problem.n_integer_vars > 0) { + compute_min_slack_per_var + <<get_stream()>>>( + problem.view(), + make_span(bound_presolve.upd.min_activity), + make_span(bound_presolve.upd.max_activity), + make_span(min_slack_per_var), + make_span(different_coefficient), + make_span(max_excess_per_var), + make_span(max_n_violated_per_constraint)); + } auto iterator = thrust::make_zip_iterator(thrust::make_tuple( 
max_n_violated_per_constraint.begin(), max_excess_per_var.begin(), min_slack_per_var.begin())); // sort the vars diff --git a/cpp/src/mip/problem/problem.cu b/cpp/src/mip/problem/problem.cu index 888388ef18..029d9ccad8 100644 --- a/cpp/src/mip/problem/problem.cu +++ b/cpp/src/mip/problem/problem.cu @@ -1145,6 +1145,10 @@ problem_t problem_t::get_problem_after_fixing_vars( rmm::device_uvector& variable_map, const raft::handle_t* handle_ptr) { + // Don't do anything if no variables are to be removed (may happen if presolve reduces a MIP to a + // LP problem) + if (variables_to_fix.size() == 0) { return *this; } + auto start_time = std::chrono::high_resolution_clock::now(); cuopt_assert(n_variables == assignment.size(), "Assignment size issue"); problem_t problem(*this, true); From 731405de39a8bfcdb56a4bf9850d4929d7414eff Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 25 Jul 2025 08:50:21 +0000 Subject: [PATCH 02/11] fix bug causing initial solutions to be scaled regardless of setting --- cpp/src/mip/solve.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/mip/solve.cu b/cpp/src/mip/solve.cu index dcfcdd0b1e..841770c4db 100644 --- a/cpp/src/mip/solve.cu +++ b/cpp/src/mip/solve.cu @@ -125,10 +125,12 @@ mip_solution_t run_mip(detail::problem_t& problem, running_mip); cuopt_func_call(auto saved_problem = scaled_problem); - if (settings.mip_scaling) { scaling.scale_problem(); } - if (settings.initial_solutions.size() > 0) { - for (const auto& initial_solution : settings.initial_solutions) { - scaling.scale_primal(*initial_solution); + if (settings.mip_scaling) { + scaling.scale_problem(); + if (settings.initial_solutions.size() > 0) { + for (const auto& initial_solution : settings.initial_solutions) { + scaling.scale_primal(*initial_solution); + } } } // only call preprocess on scaled problem, so we can compute feasibility on the original problem From c55e0805c7cd5bd2bad616d1893b0f87b9fa489f Mon Sep 17 00:00:00 2001 From: 
Alice Boucher Date: Fri, 25 Jul 2025 09:11:58 +0000 Subject: [PATCH 03/11] added test --- cpp/tests/mip/empty_fixed_problems_test.cu | 7 +++++ datasets/mip/mip-presolved-to-lp.mps | 32 ++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 datasets/mip/mip-presolved-to-lp.mps diff --git a/cpp/tests/mip/empty_fixed_problems_test.cu b/cpp/tests/mip/empty_fixed_problems_test.cu index 06ad24df2f..30d1ecf1d5 100644 --- a/cpp/tests/mip/empty_fixed_problems_test.cu +++ b/cpp/tests/mip/empty_fixed_problems_test.cu @@ -78,4 +78,11 @@ TEST(mip_solve, empty_max_problem_with_objective_test) EXPECT_NEAR(obj_val, 11, 1e-5); } +TEST(mip_solve, mip_presolved_to_lp) +{ + auto [termination_status, obj_val, lb] = test_mps_file("mip/mip-presolved-to-lp.mps", 5, false); + EXPECT_EQ(termination_status, mip_termination_status_t::Optimal); + EXPECT_NEAR(obj_val, 0, 1e-5); +} + } // namespace cuopt::linear_programming::test diff --git a/datasets/mip/mip-presolved-to-lp.mps b/datasets/mip/mip-presolved-to-lp.mps new file mode 100644 index 0000000000..755bcc328e --- /dev/null +++ b/datasets/mip/mip-presolved-to-lp.mps @@ -0,0 +1,32 @@ +NAME LP_PROBLEM +ROWS + N OBJ + E R001 + E R002 + E R003 + L R004 + L R005 + L R006 + L R007 +COLUMNS + X001 OBJ 1.000000 + X001 R004 -1.000000 + X001 R006 -1.000000 + X002 OBJ 1.000000 + X002 R005 -1.000000 + X002 R007 -1.000000 + X003 R001 1.000000 + X003 R004 1.000000 + X003 R006 -1.000000 + X004 R002 1.000000 + X004 R005 1.000000 + X004 R007 -1.000000 + X005 R001 -1.000000 + X005 R002 -1.000000 + X005 R003 1.000000 +RHS +BOUNDS + LO BND1 X005 0.000000 + UP BND1 X005 1.000000 + BV BND1 X005 +ENDATA From f81356d4cb91544c024d5e1da0d79612bb815306 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 25 Jul 2025 12:56:24 +0000 Subject: [PATCH 04/11] run PDLP when MIP problem reduced to LP --- cpp/src/mip/solver.cu | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu index 
a60aa77b5a..ee4feb83fa 100644 --- a/cpp/src/mip/solver.cu +++ b/cpp/src/mip/solver.cu @@ -23,6 +23,8 @@ #include "local_search/rounding/simple_rounding.cuh" #include "solver.cuh" +#include + #include #include #include @@ -124,6 +126,25 @@ solution_t mip_solver_t::run_solver() return sol; } + // if the problem was reduced to a LP: run PDLP + if (context.problem_ptr->n_integer_vars == 0) { + CUOPT_LOG_INFO("Problem reduced to a LP, running PDLP"); + pdlp_solver_settings_t pdlp_settings{}; + pdlp_settings.time_limit = timer_.remaining_time(); + detail::pdlp_solver_t solver(*context.problem_ptr, pdlp_settings, false); + auto start_time = std::chrono::high_resolution_clock::now(); + auto opt_sol = solver.run_solver(start_time); + solution_t sol(*context.problem_ptr); + sol.copy_new_assignment(host_copy(opt_sol.get_primal_solution())); + if (opt_sol.get_termination_status() == pdlp_termination_status_t::Optimal || + opt_sol.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || + opt_sol.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { + sol.set_problem_fully_reduced(); + } + context.problem_ptr->post_process_solution(sol); + return sol; + } + namespace dual_simplex = cuopt::linear_programming::dual_simplex; std::future branch_and_bound_status_future; dual_simplex::user_problem_t branch_and_bound_problem; From c20c9b5b8137798dd0162e43ff1144da740b7bc5 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 25 Jul 2025 12:56:47 +0000 Subject: [PATCH 05/11] Revert "Fix assert failures on MIP problems losing their integer variables after presolve" This reverts commit 32c057e1c6cd2626a672b06054400e90f8a6e3d7. 
--- .../mip/diversity/recombiners/recombiner.cuh | 14 ++++++------- cpp/src/mip/presolve/lb_probing_cache.cu | 18 ++++++++--------- cpp/src/mip/presolve/probing_cache.cu | 20 +++++++++---------- cpp/src/mip/problem/problem.cu | 4 ---- 4 files changed, 23 insertions(+), 33 deletions(-) diff --git a/cpp/src/mip/diversity/recombiners/recombiner.cuh b/cpp/src/mip/diversity/recombiners/recombiner.cuh index 2ef67c42a6..94ca34ea18 100644 --- a/cpp/src/mip/diversity/recombiners/recombiner.cuh +++ b/cpp/src/mip/diversity/recombiners/recombiner.cuh @@ -92,14 +92,12 @@ class recombiner_t { reset(a.problem_ptr->n_integer_vars, a.handle_ptr); const i_t TPB = 128; i_t n_blocks = (a.problem_ptr->n_integer_vars + TPB - 1) / TPB; - if (a.problem_ptr->n_integer_vars > 0) { - assign_same_variables_kernel - <<get_stream()>>>(a.view(), - b.view(), - offspring.view(), - cuopt::make_span(remaining_indices), - n_remaining.data()); - } + assign_same_variables_kernel + <<get_stream()>>>(a.view(), + b.view(), + offspring.view(), + cuopt::make_span(remaining_indices), + n_remaining.data()); i_t remaining_variables = this->n_remaining.value(a.handle_ptr->get_stream()); auto vec_remaining_indices = diff --git a/cpp/src/mip/presolve/lb_probing_cache.cu b/cpp/src/mip/presolve/lb_probing_cache.cu index 6af3a8fd0e..598a4c6bce 100644 --- a/cpp/src/mip/presolve/lb_probing_cache.cu +++ b/cpp/src/mip/presolve/lb_probing_cache.cu @@ -286,16 +286,14 @@ inline std::vector compute_prioritized_integer_indices( cuopt_assert(res, "The activity computation must be feasible during probing cache!"); CUOPT_LOG_INFO("prioritized integer_indices n_integer_vars %d", problem.pb->n_integer_vars); // compute the min var slack - if (problem.pb->n_integer_vars > 0) { - compute_min_slack_per_var - <<n_integer_vars, 128, 0, problem.handle_ptr->get_stream()>>>( - problem.pb->view(), - make_span_2(bound_presolve.cnst_slack), - make_span(min_slack_per_var), - make_span(different_coefficient), - make_span(max_excess_per_var), 
- make_span(max_n_violated_per_constraint)); - } + compute_min_slack_per_var + <<n_integer_vars, 128, 0, problem.handle_ptr->get_stream()>>>( + problem.pb->view(), + make_span_2(bound_presolve.cnst_slack), + make_span(min_slack_per_var), + make_span(different_coefficient), + make_span(max_excess_per_var), + make_span(max_n_violated_per_constraint)); auto iterator = thrust::make_zip_iterator(thrust::make_tuple( max_n_violated_per_constraint.begin(), max_excess_per_var.begin(), min_slack_per_var.begin())); // sort the vars diff --git a/cpp/src/mip/presolve/probing_cache.cu b/cpp/src/mip/presolve/probing_cache.cu index e1ea6a92bd..36140c5b31 100644 --- a/cpp/src/mip/presolve/probing_cache.cu +++ b/cpp/src/mip/presolve/probing_cache.cu @@ -328,17 +328,15 @@ inline std::vector compute_prioritized_integer_indices( cuopt_assert(res, "The activity computation must be feasible during probing cache!"); CUOPT_LOG_DEBUG("prioritized integer_indices n_integer_vars %d", problem.n_integer_vars); // compute the min var slack - if (problem.n_integer_vars > 0) { - compute_min_slack_per_var - <<get_stream()>>>( - problem.view(), - make_span(bound_presolve.upd.min_activity), - make_span(bound_presolve.upd.max_activity), - make_span(min_slack_per_var), - make_span(different_coefficient), - make_span(max_excess_per_var), - make_span(max_n_violated_per_constraint)); - } + compute_min_slack_per_var + <<get_stream()>>>( + problem.view(), + make_span(bound_presolve.upd.min_activity), + make_span(bound_presolve.upd.max_activity), + make_span(min_slack_per_var), + make_span(different_coefficient), + make_span(max_excess_per_var), + make_span(max_n_violated_per_constraint)); auto iterator = thrust::make_zip_iterator(thrust::make_tuple( max_n_violated_per_constraint.begin(), max_excess_per_var.begin(), min_slack_per_var.begin())); // sort the vars diff --git a/cpp/src/mip/problem/problem.cu b/cpp/src/mip/problem/problem.cu index 029d9ccad8..888388ef18 100644 --- a/cpp/src/mip/problem/problem.cu 
+++ b/cpp/src/mip/problem/problem.cu @@ -1145,10 +1145,6 @@ problem_t problem_t::get_problem_after_fixing_vars( rmm::device_uvector& variable_map, const raft::handle_t* handle_ptr) { - // Don't do anything if no variables are to be removed (may happen if presolve reduces a MIP to a - // LP problem) - if (variables_to_fix.size() == 0) { return *this; } - auto start_time = std::chrono::high_resolution_clock::now(); cuopt_assert(n_variables == assignment.size(), "Assignment size issue"); problem_t problem(*this, true); From 79fc52c2cc7a4fe02f4c2b98e0783d0044a93ae6 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 25 Jul 2025 10:06:23 -0400 Subject: [PATCH 06/11] Add CTK 12.9 fatbin flags to maintain existing binary sizes (#58) Authors: - Robert Maynard (https://github.com/robertmaynard) - Ramakrishnap (https://github.com/rgsl888prabhu) Approvers: - Ramakrishnap (https://github.com/rgsl888prabhu) URL: https://github.com/NVIDIA/cuopt/pull/58 --- cpp/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2286bd6dcc..b7da821a82 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -106,12 +106,15 @@ message("-- Building for GPU_ARCHS = ${CMAKE_CUDA_ARCHITECTURES}") # make the flags global in order to propagate flags to test cmake files set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --expt-extended-lambda") -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8.0) +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -static-global-template-stub=false") endif() list(APPEND CUOPT_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xcompiler=-Werror) list(APPEND CUOPT_CUDA_FLAGS -Xcompiler=-Wall -Wno-error=non-template-friend) list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=-compress-all) +if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 
AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0) + list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=--compress-level=3) +endif() list(APPEND CUOPT_CUDA_FLAGS -fopenmp) From a1669334d2c8f16abe4a7ade70ec7754bd52183b Mon Sep 17 00:00:00 2001 From: Scott Brenner Date: Fri, 25 Jul 2025 11:50:49 -0700 Subject: [PATCH 07/11] Bump actions/checkout in nightly.yaml to v4 (#230) Bumps the version of actions/checkout in nightly.yaml to the latest, v4 - https://github.com/actions/checkout?tab=readme-ov-file#checkout-v4 Authors: - Scott Brenner (https://github.com/ScottBrenner) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/NVIDIA/cuopt/pull/230 --- .github/workflows/nightly.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index 97a2e8dada..a20baba5b6 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -14,7 +14,7 @@ jobs: include: - cuopt_version: "25.08" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Trigger Pipeline env: GH_TOKEN: ${{ github.token }} @@ -61,7 +61,7 @@ jobs: include: - cuopt_version: "25.08" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Trigger Test env: GH_TOKEN: ${{ github.token }} From de128e98f20580bb02d78afa846ea7ff35a9ce2b Mon Sep 17 00:00:00 2001 From: Ramakrishnap <42624703+rgsl888prabhu@users.noreply.github.com> Date: Fri, 25 Jul 2025 15:49:32 -0500 Subject: [PATCH 08/11] Add helm chart for cuopt service (#224) The helm chart adds an option to users to deploy cuOpt in kubernetes with basic setting. Users can always modify or create a helm chart as per their requirement. 
## Issue closes #124 Authors: - Ramakrishnap (https://github.com/rgsl888prabhu) Approvers: - Trevor McKay (https://github.com/tmckayus) URL: https://github.com/NVIDIA/cuopt/pull/224 --- .pre-commit-config.yaml | 4 +- ci/release/update-version-cuopt.sh | 5 + helmchart/cuopt-server/Chart.yaml | 17 ++++ helmchart/cuopt-server/README.md | 66 +++++++++++++ helmchart/cuopt-server/templates/NOTES.txt | 34 +++++++ helmchart/cuopt-server/templates/_helpers.tpl | 62 ++++++++++++ .../cuopt-server/templates/deployment.yaml | 88 +++++++++++++++++ helmchart/cuopt-server/templates/ingress.yaml | 59 +++++++++++ helmchart/cuopt-server/templates/service.yaml | 15 +++ .../templates/serviceaccount.yaml | 12 +++ helmchart/cuopt-server/values.yaml | 97 +++++++++++++++++++ 11 files changed, 458 insertions(+), 1 deletion(-) create mode 100644 helmchart/cuopt-server/Chart.yaml create mode 100644 helmchart/cuopt-server/README.md create mode 100644 helmchart/cuopt-server/templates/NOTES.txt create mode 100644 helmchart/cuopt-server/templates/_helpers.tpl create mode 100644 helmchart/cuopt-server/templates/deployment.yaml create mode 100644 helmchart/cuopt-server/templates/ingress.yaml create mode 100644 helmchart/cuopt-server/templates/service.yaml create mode 100644 helmchart/cuopt-server/templates/serviceaccount.yaml create mode 100644 helmchart/cuopt-server/values.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c218b4470e..31d21ee02c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: hooks: - id: end-of-file-fixer files: \.(mps|json|yaml|yml|txt)$ - exclude: ^datasets/.*\.(mps|json|yaml|yml|txt)$ + exclude: ^(datasets|helmchart)/.*\.(mps|json|yaml|yml|txt)$ - id: trailing-whitespace files: \.(mps|json|yaml|yml|txt)$ exclude: ^datasets/.*\.(mps|json|yaml|yml|txt)$ @@ -27,6 +27,8 @@ repos: - id: check-executables-have-shebangs - id: check-json - id: check-yaml + files: \.(yaml)$ + exclude: ^(helmchart)/.*\.(yaml)$ - repo: 
https://github.com/PyCQA/isort rev: 5.12.0 hooks: diff --git a/ci/release/update-version-cuopt.sh b/ci/release/update-version-cuopt.sh index 3565623991..3e2216fa4a 100755 --- a/ci/release/update-version-cuopt.sh +++ b/ci/release/update-version-cuopt.sh @@ -110,6 +110,11 @@ sed_runner "s/\(cuopt-sh-client==\)[0-9]\+\.[0-9]\+\.\\*/\1${PY_NEXT_SHORT_TAG}. sed_runner 's/cuopt-server=[0-9][0-9].[0-9][0-9] cuopt-sh-client=[0-9][0-9].[0-9][0-9] python=[0-9].[0-9][0-9] cuda-version=[0-9][0-9].[0-9]/cuopt-server='${NEXT_SHORT_TAG}' cuopt-sh-client='${NEXT_SHORT_TAG}' python=3.12 cuda-version=12.8/g' README.md sed_runner 's|cuopt:[0-9]\{2\}\.[0-9]\{1,2\}\.[0-9]\+\(-cuda12\.8-\)\(py[0-9]\+\)|cuopt:'"${DOCKER_TAG}"'\1\2|g' README.md +# Update Helm chart files +sed_runner 's/\(tag: "\)[0-9][0-9]\.[0-9]\+\.[0-9]\+\(-cuda12\.8-py3\.12"\)/\1'${DOCKER_TAG}'\2/g' helmchart/cuopt-server/values.yaml +sed_runner 's/\(appVersion: \)[0-9][0-9]\.[0-9]\+\.[0-9]\+/\1'${DOCKER_TAG}'/g' helmchart/cuopt-server/Chart.yaml +sed_runner 's/\(version: \)[0-9][0-9]\.[0-9]\+\.[0-9]\+/\1'${DOCKER_TAG}'/g' helmchart/cuopt-server/Chart.yaml + DEPENDENCIES=( libcuopt cuopt diff --git a/helmchart/cuopt-server/Chart.yaml b/helmchart/cuopt-server/Chart.yaml new file mode 100644 index 0000000000..8fa7495290 --- /dev/null +++ b/helmchart/cuopt-server/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +appVersion: 25.8.0 +description: A Helm chart for NVIDIA cuOpt Server with GPU support +home: https://docs.nvidia.com/cuopt/user-guide/latest/resources.html +keywords: +- nvidia +- cuopt +- optimization +- gpu +maintainers: +- email: cuopt@nvidia.com + name: cuopt-maintainer +name: cuopt-server +sources: +- https://docs.nvidia.com/cuopt/user-guide/latest/resources.html +type: application +version: 25.8.0 diff --git a/helmchart/cuopt-server/README.md b/helmchart/cuopt-server/README.md new file mode 100644 index 0000000000..0b1a1cec28 --- /dev/null +++ b/helmchart/cuopt-server/README.md @@ -0,0 +1,66 @@ +# cuOpt Server 
Helm Chart + +This Helm chart deploys the NVIDIA cuOpt Server with GPU support on Kubernetes. + +## Prerequisites + +- Kubernetes cluster with GPU nodes +- NVIDIA device plugin installed on the cluster +- NVIDIA GPU Operator (recommended) or manual GPU driver installation +- Helm 3.x installed + +## Selecting the Container Image + +- To use a specific version of the cuOpt server, update the `image.tag` field in `values.yaml`. +- If the desired version is not available as a release, you may use a nightly image. +- All available container tags can be found on [Docker Hub](https://hub.docker.com/r/nvidia/cuopt/tags). +## Installation + +### 1. Add the chart repository (if publishing to a repository) +```bash +helm repo add cuopt-server https://your-repo-url +helm repo update +``` + +### 2. Install the chart +```bash +# Install with default values +helm install cuopt-server ./cuopt-server + +# Install with custom values +helm install cuopt-server ./cuopt-server -f custom-values.yaml + +# Install with inline overrides +helm install cuopt-server ./cuopt-server \ + --set resources.requests.nvidia.com/gpu=2 \ + --set resources.limits.nvidia.com/gpu=2 +``` + +## Usage + +### Port Forwarding (for ClusterIP service) +```bash +kubectl port-forward service/cuopt-server 5000:5000 +``` + +### Accessing the Service +Once deployed, you can access the cuOpt server API at: +- `http://localhost:5000` (with port forwarding) +- Or through the service endpoint within the cluster + +### Testing the Deployment +```bash +# Check pod status +kubectl get pods -l app.kubernetes.io/name=cuopt-server + +# View logs +kubectl logs -l app.kubernetes.io/name=cuopt-server + +# Check GPU allocation +kubectl describe pod -l app.kubernetes.io/name=cuopt-server +``` + +## Uninstall + +```bash +helm uninstall cuopt-server \ No newline at end of file diff --git a/helmchart/cuopt-server/templates/NOTES.txt b/helmchart/cuopt-server/templates/NOTES.txt new file mode 100644 index 0000000000..6e258353e2 --- 
/dev/null +++ b/helmchart/cuopt-server/templates/NOTES.txt @@ -0,0 +1,34 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "cuopt-server.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "cuopt-server.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "cuopt-server.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "{{ include "cuopt-server.selectorLabels" . }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:5000 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 5000:$CONTAINER_PORT +{{- end }} + +2. To check the status of your cuOpt server deployment: + kubectl get pods -l "{{ include "cuopt-server.selectorLabels" . }}" + +3. To view the logs: + kubectl logs -l "{{ include "cuopt-server.selectorLabels" . 
}}" + +4. Important Notes: + - This deployment requires GPU nodes in your cluster + - Make sure your cluster has the NVIDIA device plugin installed + - The server will be running the command: python -m cuopt_server.cuopt_service + - Port 5000 is exposed for the cuOpt service API \ No newline at end of file diff --git a/helmchart/cuopt-server/templates/_helpers.tpl b/helmchart/cuopt-server/templates/_helpers.tpl new file mode 100644 index 0000000000..d35eb80829 --- /dev/null +++ b/helmchart/cuopt-server/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "cuopt-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "cuopt-server.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "cuopt-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "cuopt-server.labels" -}} +helm.sh/chart: {{ include "cuopt-server.chart" . }} +{{ include "cuopt-server.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "cuopt-server.selectorLabels" -}} +app.kubernetes.io/name: {{ include "cuopt-server.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "cuopt-server.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "cuopt-server.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helmchart/cuopt-server/templates/deployment.yaml b/helmchart/cuopt-server/templates/deployment.yaml new file mode 100644 index 0000000000..5e51f58825 --- /dev/null +++ b/helmchart/cuopt-server/templates/deployment.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "cuopt-server.fullname" . }} + labels: + {{- include "cuopt-server.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "cuopt-server.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "cuopt-server.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "cuopt-server.serviceAccountName" . 
}} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- if .Values.command }} + command: + {{- toYaml .Values.command | nindent 12 }} + {{- end }} + ports: + - name: http + containerPort: {{ .Values.service.targetPort }} + protocol: TCP + livenessProbe: + httpGet: + path: /v2/health/live + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /v2/health/ready + port: http + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.env }} + env: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if .Values.persistence.enabled }} + volumeMounts: + - name: storage + mountPath: /data + {{- end }} + {{- if .Values.persistence.enabled }} + volumes: + - name: storage + persistentVolumeClaim: + claimName: {{ include "cuopt-server.fullname" . }}-pvc + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helmchart/cuopt-server/templates/ingress.yaml b/helmchart/cuopt-server/templates/ingress.yaml new file mode 100644 index 0000000000..97a98fd36f --- /dev/null +++ b/helmchart/cuopt-server/templates/ingress.yaml @@ -0,0 +1,59 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "cuopt-server.fullname" . 
-}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class")) }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "cuopt-server.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helmchart/cuopt-server/templates/service.yaml b/helmchart/cuopt-server/templates/service.yaml new file mode 100644 index 0000000000..9327414beb --- /dev/null +++ b/helmchart/cuopt-server/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "cuopt-server.fullname" . }} + labels: + {{- include "cuopt-server.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} + protocol: TCP + name: http + selector: + {{- include "cuopt-server.selectorLabels" . | nindent 4 }} \ No newline at end of file diff --git a/helmchart/cuopt-server/templates/serviceaccount.yaml b/helmchart/cuopt-server/templates/serviceaccount.yaml new file mode 100644 index 0000000000..a49d5ce20a --- /dev/null +++ b/helmchart/cuopt-server/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "cuopt-server.serviceAccountName" . }} + labels: + {{- include "cuopt-server.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helmchart/cuopt-server/values.yaml b/helmchart/cuopt-server/values.yaml new file mode 100644 index 0000000000..450865862b --- /dev/null +++ b/helmchart/cuopt-server/values.yaml @@ -0,0 +1,97 @@ +# Default values for cuopt-server. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: nvidia/cuopt + pullPolicy: IfNotPresent + tag: "25.8.0-cuda12.8-py3.12" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 5000 + targetPort: 5000 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: cuopt-server.local + paths: + - path: / + pathType: Prefix + tls: [] + # - secretName: cuopt-server-tls + # hosts: + # - cuopt-server.local + +resources: + limits: + nvidia.com/gpu: 1 + + requests: + nvidia.com/gpu: 1 + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +nodeSelector: + # Uncomment to schedule on nodes with GPU + # accelerator: nvidia-tesla-k80 + +tolerations: [] + +affinity: {} + +# Command to run the cuOpt service +command: + - python + - -m + - cuopt_server.cuopt_service + - -p + - "5000" + +# Environment variables +env: [] + +# Persistent storage (if needed) +persistence: + 
enabled: false + accessMode: ReadWriteOnce + size: 1Gi + # storageClass: "" From 5e711853556b93c8c71ee8bd1f7c5384ed905a32 Mon Sep 17 00:00:00 2001 From: Ramakrishnap <42624703+rgsl888prabhu@users.noreply.github.com> Date: Fri, 25 Jul 2025 16:31:10 -0500 Subject: [PATCH 09/11] Add nightly container support (#180) Adds workflow for container build, test and push for nightly and release. Nightly container would have the tag for example 25.8.0a-cuda12.8-py3.12 And these will be published to https://hub.docker.com/r/nvidia/cuopt/tags and ngc internal registry. This PR also removes several unused workflows and also update license header. ## Issue closes #123 Authors: - Ramakrishnap (https://github.com/rgsl888prabhu) Approvers: - Trevor McKay (https://github.com/tmckayus) - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/NVIDIA/cuopt/pull/180 --- .github/workflows/build.yaml | 29 +-- .github/workflows/build_images.yaml | 102 +++++++++ .../workflows/build_test_publish_images.yaml | 185 ++++++++++++++++ .github/workflows/service_nightly.yaml | 201 ------------------ .github/workflows/test_images.yaml | 62 ++++++ ci/docker/Dockerfile | 103 +++++++++ ci/docker/README.md | 13 ++ ci/docker/context/README.md | 1 + ci/docker/create_multiarch_manifest.sh | 104 +++++++++ ci/docker/test_image.sh | 59 +++++ .../cuopt_server/tests/utils/utils.py | 36 +++- 11 files changed, 668 insertions(+), 227 deletions(-) create mode 100644 .github/workflows/build_images.yaml create mode 100644 .github/workflows/build_test_publish_images.yaml delete mode 100644 .github/workflows/service_nightly.yaml create mode 100644 .github/workflows/test_images.yaml create mode 100644 ci/docker/Dockerfile create mode 100644 ci/docker/README.md create mode 100644 ci/docker/context/README.md create mode 100644 ci/docker/create_multiarch_manifest.sh create mode 100644 ci/docker/test_image.sh diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6f8b0485ff..c0ac2cc451 
100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -207,23 +207,12 @@ jobs: date: ${{ inputs.date }} package-name: cuopt_sh_client package-type: python - service-container: - if: inputs.build_type == 'nightly' - needs: [wheel-build-cuopt, wheel-build-cuopt-server] - runs-on: ubuntu-latest - steps: - - name: Checkout code repo - uses: actions/checkout@v3 - with: - ref: ${{ inputs.sha }} - fetch-depth: 0 # unshallow fetch for setuptools-scm - persist-credentials: false - - name: build service - env: - GH_TOKEN: ${{ github.token }} - run: | - gh workflow run service_nightly.yaml \ - -f branch=${{ inputs.branch }} \ - -f sha=${{ inputs.sha }} \ - -f date=${{ inputs.date }} \ - -f build_type=${{ inputs.build_type }} + build-images: + needs: [wheel-publish-cuopt, wheel-publish-cuopt-server] + uses: ./.github/workflows/build_test_publish_images.yaml + secrets: inherit + with: + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + build_type: ${{ inputs.build_type || 'branch' }} diff --git a/.github/workflows/build_images.yaml b/.github/workflows/build_images.yaml new file mode 100644 index 0000000000..7964272644 --- /dev/null +++ b/.github/workflows/build_images.yaml @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Build and push image variant + +on: + workflow_call: + inputs: + ARCHES: + required: true + type: string + CUDA_VER: + required: true + type: string + CUOPT_VER: + required: true + type: string + IMAGE_TAG_PREFIX: + required: true + type: string + LINUX_VER: + required: true + type: string + PYTHON_VER: + required: true + type: string + +jobs: + build: + strategy: + matrix: + ARCH: ["${{ inputs.ARCHES }}"] + CUDA_VER: ["${{ inputs.CUDA_VER }}"] + PYTHON_VER: ["${{ inputs.PYTHON_VER }}"] + LINUX_VER: ["${{ inputs.LINUX_VER }}"] + fail-fast: false + runs-on: "linux-${{ matrix.ARCH }}-cpu4" + steps: + - name: Checkout code repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.CUOPT_DOCKERHUB_USERNAME }} + password: ${{ secrets.CUOPT_DOCKERHUB_TOKEN }} + - name: Copy License and Version files + run: | + cp ./LICENSE ./ci/docker/context/LICENSE + cp ./VERSION ./ci/docker/context/VERSION + cp ./thirdparty/THIRD_PARTY_LICENSES ./ci/docker/context/THIRD_PARTY_LICENSES + - name: Login to NGC + uses: docker/login-action@v3 + with: + registry: "nvcr.io" + username: "$oauthtoken" + password: ${{ secrets.CUOPT_NGC_DOCKER_KEY }} + - name: Set up Docker Context for Buildx + id: buildx-context + run: | + docker context create builders + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver: docker + endpoint: ./ci/docker/context + - name: Trim CUDA and Python versions + id: trim + run: | + echo "CUDA_SHORT=$(echo '${{ inputs.CUDA_VER }}' | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT + echo "PYTHON_SHORT=$(echo '${{ inputs.PYTHON_VER }}' | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT + - name: Build image and push to DockerHub and NGC + uses: docker/build-push-action@v6 + with: + context: ./ci/docker/context + file: ./ci/docker/Dockerfile + push: true + pull: true + build-args: | + CUDA_VER=${{ inputs.CUDA_VER }} + 
PYTHON_SHORT_VER=${{ steps.trim.outputs.PYTHON_SHORT }} + CUOPT_VER=${{ inputs.CUOPT_VER }} + LINUX_VER=${{ inputs.LINUX_VER }} + tags: nvidia/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ steps.trim.outputs.CUDA_SHORT }}-py${{ steps.trim.outputs.PYTHON_SHORT }}-${{ matrix.ARCH }} + + - name: Push image to NGC + run: | + docker tag nvidia/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ steps.trim.outputs.CUDA_SHORT }}-py${{ steps.trim.outputs.PYTHON_SHORT }}-${{ matrix.ARCH }} nvcr.io/nvstaging/nvaie/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ steps.trim.outputs.CUDA_SHORT }}-py${{ steps.trim.outputs.PYTHON_SHORT }}-${{ matrix.ARCH }} + docker push nvcr.io/nvstaging/nvaie/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ steps.trim.outputs.CUDA_SHORT }}-py${{ steps.trim.outputs.PYTHON_SHORT }}-${{ matrix.ARCH }} diff --git a/.github/workflows/build_test_publish_images.yaml b/.github/workflows/build_test_publish_images.yaml new file mode 100644 index 0000000000..aaac5991e0 --- /dev/null +++ b/.github/workflows/build_test_publish_images.yaml @@ -0,0 +1,185 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Build, Test and Publish cuopt images + +on: + workflow_call: + inputs: + branch: + type: string + date: + type: string + sha: + type: string + build_type: + type: string + arch: + type: string + default: '["amd64", "arm64"]' + description: 'JSON array of architectures to build for' + cuda_ver: + type: string + default: '["12.8.0"]' + description: 'JSON array of CUDA versions to build for' + python_ver: + type: string + default: '["3.12.11"]' + description: 'JSON array of Python versions to build for' + linux_ver: + type: string + default: '["22.04"]' + description: 'JSON array of Linux versions to build for' + + +defaults: + run: + shell: bash + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: ubuntu-latest + container: + image: rapidsai/ci-conda:25.08-latest + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + CUOPT_VER: ${{ steps.compute-cuopt-ver.outputs.CUOPT_VER }} + IMAGE_TAG_PREFIX: ${{ steps.compute-cuopt-ver.outputs.IMAGE_TAG_PREFIX }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 # unshallow fetch for setuptools-scm + persist-credentials: false + + - name: Compute matrix + id: compute-matrix + run: | + MATRIX=$(jq -c '.' 
<> $GITHUB_OUTPUT + + - name: Install gha-tools + run: | + mkdir -p /tmp/gha-tools + curl -s -L 'https://github.com/rapidsai/gha-tools/releases/latest/download/tools.tar.gz' | tar -xz -C /tmp/gha-tools + echo "/tmp/gha-tools" >> "${GITHUB_PATH}" + + - name: Compute cuopt version + id: compute-cuopt-ver + run: | + ver=$(rapids-generate-version) + # Remove starting 0s from version 25.08.0a18 -> 25.8.0a18 + CUOPT_VER=$(echo "$ver" | sed -E 's/\.0+([0-9])/\.\1/g') + echo "CUOPT_VER=$CUOPT_VER" >> $GITHUB_OUTPUT + if rapids-is-release-build; then + IMAGE_TAG_PREFIX="$CUOPT_VER" + else + IMAGE_TAG_PREFIX=$(echo "$CUOPT_VER" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)a.*/\1a/') + fi + echo "IMAGE_TAG_PREFIX=$IMAGE_TAG_PREFIX" >> $GITHUB_OUTPUT + + build-images: + name: Build images + needs: compute-matrix + secrets: inherit + strategy: + matrix: ${{ fromJson(needs.compute-matrix.outputs.MATRIX) }} + uses: ./.github/workflows/build_images.yaml + with: + ARCHES: ${{ matrix.arch }} + CUDA_VER: ${{ matrix.cuda_ver }} + CUOPT_VER: ${{ needs.compute-matrix.outputs.CUOPT_VER }} + IMAGE_TAG_PREFIX: ${{ needs.compute-matrix.outputs.IMAGE_TAG_PREFIX }} + LINUX_VER: ${{ matrix.linux_ver }} + PYTHON_VER: ${{ matrix.python_ver }} + + build-cuopt-multiarch-manifest: + name: Build cuopt multiarch manifest + needs: [build-images, compute-matrix] + strategy: + matrix: + CUDA_VER: ${{ fromJson(needs.compute-matrix.outputs.MATRIX).cuda_ver }} + PYTHON_VER: ${{ fromJson(needs.compute-matrix.outputs.MATRIX).python_ver }} + runs-on: ubuntu-latest + steps: + - name: Checkout code repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.CUOPT_DOCKERHUB_USERNAME }} + password: ${{ secrets.CUOPT_DOCKERHUB_TOKEN }} + - name: Login to NGC + uses: docker/login-action@v3 + with: + registry: "nvcr.io" + username: "$oauthtoken" + password: ${{ secrets.CUOPT_NGC_DOCKER_KEY }} + - name: Trim CUDA and Python versions + 
id: trim + run: | + echo "CUDA_SHORT=$(echo '${{ matrix.CUDA_VER }}' | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT + echo "PYTHON_SHORT=$(echo '${{ matrix.PYTHON_VER }}' | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT + - name: Create multiarch manifest + shell: bash + env: + CUOPT_VER: ${{ needs.compute-matrix.outputs.CUOPT_VER }} + CUDA_SHORT: ${{ steps.trim.outputs.CUDA_SHORT }} + PYTHON_SHORT: ${{ steps.trim.outputs.PYTHON_SHORT }} + IMAGE_TAG_PREFIX: ${{ needs.compute-matrix.outputs.IMAGE_TAG_PREFIX }} + BUILD_TYPE: ${{ inputs.build_type }} + run: bash ci/docker/create_multiarch_manifest.sh + + test-images: + name: Test images + needs: [build-cuopt-multiarch-manifest, compute-matrix] + secrets: inherit + strategy: + matrix: + CUDA_VER: ${{ fromJson(needs.compute-matrix.outputs.MATRIX).cuda_ver }} + PYTHON_VER: ${{ fromJson(needs.compute-matrix.outputs.MATRIX).python_ver }} + ARCH: ${{ fromJson(needs.compute-matrix.outputs.MATRIX).arch }} + uses: ./.github/workflows/test_images.yaml + with: + ARCH: ${{ matrix.ARCH }} + CUDA_VER: ${{ matrix.CUDA_VER }} + PYTHON_VER: ${{ matrix.PYTHON_VER }} + IMAGE_TAG_PREFIX: ${{ needs.compute-matrix.outputs.IMAGE_TAG_PREFIX }} diff --git a/.github/workflows/service_nightly.yaml b/.github/workflows/service_nightly.yaml deleted file mode 100644 index 94071d1bf4..0000000000 --- a/.github/workflows/service_nightly.yaml +++ /dev/null @@ -1,201 +0,0 @@ -name: Build Managed service docker, deploy and test - -on: - workflow_dispatch: - inputs: - branch: - type: string - date: - type: string - sha: - type: string - build_type: - type: string - -defaults: - run: - shell: bash - -permissions: - actions: read - checks: none - contents: read - deployments: none - discussions: none - id-token: write - issues: none - packages: read - pages: none - pull-requests: read - repository-projects: none - security-events: none - statuses: none - -jobs: - managed-service-nightly-amd: - name: Managed service nightly build 
for AMD64 architecture - env: - GH_TOKEN: ${{ github.token }} - RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} - RAPIDS_CUDA_VERSION: "12.5.1" - RAPIDS_PY_VERSION: "3.12" - DOCKER_BUILDKIT: 1 - runs-on: "linux-amd64-cpu4" - steps: - - uses: aws-actions/configure-aws-credentials@v2 - with: - role-to-assume: ${{ vars.AWS_ROLE_ARN }} - aws-region: ${{ vars.AWS_REGION }} - role-duration-seconds: 43200 # 12h - - - name: Checkout code repo - uses: actions/checkout@v4 - with: - ref: ${{ inputs.sha }} - fetch-depth: 0 # unshallow fetch for setuptools-scm - persist-credentials: false - - - name: Standardize repository information - uses: rapidsai/shared-actions/rapids-github-info@main - with: - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - - - name: Docker login to nvcr.io - uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 - with: - registry: "nvcr.io" - username: "$oauthtoken" - password: ${{ secrets.CUOPT_PRD_NGC_DOCKER_KEY }} - - - name: Install aws and python - run: | - set -x - sudo apt-get update -y && sudo apt-get install -y software-properties-common && sudo add-apt-repository -y ppa:deadsnakes/ppa - sudo apt-get install -y awscli python3.12 - - - name: Install GHA tools - run: | - git clone https://github.com/rapidsai/gha-tools.git -b main /tmp/gha-tools - echo "/tmp/gha-tools/tools" >> "${GITHUB_PATH}" - - - name: Download latest artifacts from S3 - run: | - # make rapids-download-wheels-from-github download everything to the same directory - export RAPIDS_UNZIP_DIR="$(pwd)/wheels" - mkdir "${RAPIDS_UNZIP_DIR}" - - # download latest wheels built from build.yaml - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - RAPIDS_PY_WHEEL_NAME="cuopt_mps_parser" rapids-download-wheels-from-github python - RAPIDS_PY_WHEEL_NAME="cuopt_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github python - RAPIDS_PY_WHEEL_NAME="cuopt_server_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github 
python - RAPIDS_PY_WHEEL_NAME="libcuopt_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp - - - name: Generate git commit file for tracking the container - run: | - bash container-builder/bin/make_git_info.sh ./ - - - name: Build cuopt self hosted service docker image - uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671 - with: - context: . - push: true - tags: nvcr.io/j9mrpofbmtxd/test/cuopt:25.08 - file: ci/build-service.Dockerfile - build-args: | - nspect_id="NSPECT-LZ5P-VOVE" - arch=amd - - - name: Push image to prod env - run: | - docker tag nvcr.io/j9mrpofbmtxd/test/cuopt:25.08 nvcr.io/0616513341838337/cuopt:nightly - docker tag nvcr.io/j9mrpofbmtxd/test/cuopt:25.08 nvcr.io/0616513341838337/cuopt:25.08 - - docker push nvcr.io/0616513341838337/cuopt:nightly - docker push nvcr.io/0616513341838337/cuopt:25.08 - - managed-service-nightly-arm: - name: Managed service nightly build for ARM architecture - env: - GH_TOKEN: ${{ github.token }} - RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} - RAPIDS_CUDA_VERSION: "12.5.1" - RAPIDS_PY_VERSION: "3.12" - DOCKER_BUILDKIT: 1 - runs-on: "linux-arm64-cpu4" - steps: - - uses: aws-actions/configure-aws-credentials@v2 - with: - role-to-assume: ${{ vars.AWS_ROLE_ARN }} - aws-region: ${{ vars.AWS_REGION }} - role-duration-seconds: 43200 # 12h - - - name: Checkout code repo - uses: actions/checkout@v4 - with: - ref: ${{ inputs.sha }} - fetch-depth: 0 # unshallow fetch for setuptools-scm - persist-credentials: false - - - name: Standardize repository information - uses: rapidsai/shared-actions/rapids-github-info@main - with: - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - - - name: Docker login to nvcr.io - uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 - with: - registry: "nvcr.io" - username: "$oauthtoken" - password: ${{ secrets.CUOPT_PRD_NGC_DOCKER_KEY }} - - - name: Install aws and python - run: | - set -x - sudo apt-get update -y && 
sudo apt-get install -y software-properties-common && sudo add-apt-repository -y ppa:deadsnakes/ppa - sudo apt-get install -y awscli python3.12 - - - name: Install GHA tools - run: | - git clone https://github.com/rapidsai/gha-tools.git -b main /tmp/gha-tools - echo "/tmp/gha-tools/tools" >> "${GITHUB_PATH}" - - - name: Download latest artifacts from S3 - run: | - # make rapids-download-wheels-from-github download everything to the same directory - export RAPIDS_UNZIP_DIR="$(pwd)/wheels" - mkdir "${RAPIDS_UNZIP_DIR}" - - # download latest wheels built from build.yaml - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - RAPIDS_PY_WHEEL_NAME="cuopt_mps_parser" rapids-download-wheels-from-github python - RAPIDS_PY_WHEEL_NAME="cuopt_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github python - RAPIDS_PY_WHEEL_NAME="cuopt_server_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github python - RAPIDS_PY_WHEEL_NAME="libcuopt_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp - - - name: Generate git commit file for tracking the container - run: | - bash container-builder/bin/make_git_info.sh ./ - - - name: Build cuopt self hosted service docker image - uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671 - with: - context: . 
- push: true - tags: nvcr.io/j9mrpofbmtxd/test/cuopt:25.08.arm - file: ci/build-service.Dockerfile - build-args: | - nspect_id="NSPECT-LZ5P-VOVE" - arch=arm - - - name: Push image to prod env - run: | - docker tag nvcr.io/j9mrpofbmtxd/test/cuopt:25.08.arm nvcr.io/0616513341838337/cuopt:nightly.arm - docker tag nvcr.io/j9mrpofbmtxd/test/cuopt:25.08.arm nvcr.io/0616513341838337/cuopt:25.08.arm - - docker push nvcr.io/0616513341838337/cuopt:nightly.arm - docker push nvcr.io/0616513341838337/cuopt:25.08.arm diff --git a/.github/workflows/test_images.yaml b/.github/workflows/test_images.yaml new file mode 100644 index 0000000000..24e669bf7e --- /dev/null +++ b/.github/workflows/test_images.yaml @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, + +name: Test images + +on: + workflow_call: + inputs: + ARCH: + required: true + type: string + CUDA_VER: + required: true + type: string + PYTHON_VER: + required: true + type: string + IMAGE_TAG_PREFIX: + required: true + type: string + + +jobs: + + prepare: + runs-on: ubuntu-latest + outputs: + CUDA_SHORT: ${{ steps.trim.outputs.CUDA_SHORT }} + PYTHON_SHORT: ${{ steps.trim.outputs.PYTHON_SHORT }} + steps: + - name: Trim versions + id: trim + run: | + CUDA_SHORT=$(echo "${{ inputs.CUDA_VER }}" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/') + PYTHON_SHORT=$(echo "${{ inputs.PYTHON_VER }}" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/') + + echo "CUDA_SHORT=$CUDA_SHORT" >> $GITHUB_OUTPUT + echo "PYTHON_SHORT=$PYTHON_SHORT" >> $GITHUB_OUTPUT + + test: + runs-on: 
"linux-${{ inputs.ARCH }}-gpu-a100-latest-1" + needs: prepare + container: + image: "nvidia/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ needs.prepare.outputs.CUDA_SHORT }}-py${{ needs.prepare.outputs.PYTHON_SHORT }}" + options: --user root + steps: + - name: Checkout code repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Test cuopt + run: | + bash ./ci/docker/test_image.sh diff --git a/ci/docker/Dockerfile b/ci/docker/Dockerfile new file mode 100644 index 0000000000..7810aa4de1 --- /dev/null +++ b/ci/docker/Dockerfile @@ -0,0 +1,103 @@ +# syntax=docker/dockerfile:1.2 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG CUDA_VER=unset +ARG CUOPT_VER=unset +ARG PYTHON_SHORT_VER=unset +ARG LINUX_VER=unset + +FROM nvidia/cuda:${CUDA_VER}-runtime-ubuntu${LINUX_VER} AS cuda-libs + +# Install cuOpt +FROM nvidia/cuda:${CUDA_VER}-base-ubuntu${LINUX_VER} AS python-env + +ARG CUDA_VER +ARG CUOPT_VER +ARG PYTHON_SHORT_VER + +ENV DEBIAN_FRONTEND=noninteractive + +# gcc is required for building psutils +RUN apt-get update && apt-get install -y --no-install-recommends build-essential software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get install -y --no-install-recommends \ + wget \ + unzip \ + gcc \ + python${PYTHON_SHORT_VER} \ + python${PYTHON_SHORT_VER}-dev \ + python${PYTHON_SHORT_VER}-venv \ + && rm -rf /var/lib/apt/lists/* && \ + python${PYTHON_SHORT_VER} -m ensurepip --upgrade && \ + python${PYTHON_SHORT_VER} -m pip install --upgrade pip + +ENV DEBIAN_FRONTEND="" + +RUN ln -sf /usr/bin/python${PYTHON_SHORT_VER} /usr/bin/python && \ + groupadd -r cuopt && \ + useradd -r -g cuopt cuopt && \ + chown -R cuopt:cuopt /usr/local/lib/python${PYTHON_SHORT_VER}/dist-packages + +USER cuopt + +FROM python-env AS install-env + +WORKDIR /home/cuopt + +ARG CUOPT_VER +ARG PYTHON_SHORT_VER + +RUN cuda_suffix=cu$(echo ${CUDA_VER} | cut -d'.' -f1) && \ + cuda_major_minor=$(echo ${CUDA_VER} | cut -d'.' 
-f1-2) && \ + python -m pip install \ + --extra-index-url https://pypi.nvidia.com \ + --extra-index-url https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \ + --user \ + --no-cache-dir \ + "cuopt-server-${cuda_suffix}==${CUOPT_VER}" \ + "cuopt-sh-client==${CUOPT_VER}" \ + "nvidia-cuda-runtime-${cuda_suffix}==${cuda_major_minor}.*" && \ + python -m pip list + +USER root + +# Remove gcc to save space, gcc was required for building psutils +RUN apt-get purge -y gcc && rm -rf /var/lib/apt/lists/* + +USER cuopt + +COPY ./LICENSE /home/cuopt/LICENSE +COPY ./VERSION /home/cuopt/VERSION +COPY ./THIRD_PARTY_LICENSES /home/cuopt/THIRD_PARTY_LICENSES + +FROM install-env AS cuopt-final + +ARG PYTHON_SHORT_VER + +# Set environment variables in .bashrc for all future shells +RUN echo 'export PATH="/usr/local/cuda/bin:/usr/bin:/usr/local/bin:/usr/local/nvidia/bin/:/home/cuopt/.local/lib/python${PYTHON_SHORT_VER}/dist-packages/libcuopt/bin/:/home/cuopt/.local/bin:$PATH"' >> /home/cuopt/.bashrc && \ + echo 'export LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:/usr/lib/aarch64-linux-gnu:/usr/local/cuda/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/wsl/lib:/usr/lib/wsl/lib/libnvidia-container:/usr/lib/nvidia:/usr/lib/nvidia-current:/home/cuopt/.local/lib/python${PYTHON_SHORT_VER}/dist-packages/libcuopt/lib/:/home/cuopt/.local/lib/python${PYTHON_SHORT_VER}/dist-packages/rapids_logger/lib64:${LD_LIBRARY_PATH}"' >> /home/cuopt/.bashrc + + +# Create a .bash_profile that sources .bashrc if it exists +RUN echo 'if [ -f ~/.bashrc ]; then . 
~/.bashrc; fi' > /home/cuopt/.bash_profile + +COPY --from=cuda-libs /usr/local/cuda/lib64/libnvrtc* /usr/local/cuda/lib64/ +COPY --from=cuda-libs /usr/local/cuda/lib64/libnvJitLink* /usr/local/cuda/lib64/ + +# Use a shell as entrypoint to handle both service and interactive modes +ENTRYPOINT ["/bin/bash", "-c"] +CMD ["python -m cuopt_server.cuopt_service"] diff --git a/ci/docker/README.md b/ci/docker/README.md new file mode 100644 index 0000000000..0645da0722 --- /dev/null +++ b/ci/docker/README.md @@ -0,0 +1,13 @@ +# Container Image build and test suite + +## context + +Add all files and data required by the buildx context (for example, the entrypoint script) to the ``context`` folder. + +## test + +To test the container image, run the [test_image.sh](test_image.sh) script from the root of an up-to-date checkout of the GitHub repository, as shown below: + +```bash +docker run -it --rm --gpus all -u root --volume $PWD:/repo -w /repo --entrypoint "/bin/bash" nvidia/cuopt:[TAG] ./ci/docker/test_image.sh +``` \ No newline at end of file diff --git a/ci/docker/context/README.md b/ci/docker/context/README.md new file mode 100644 index 0000000000..84191ae9ef --- /dev/null +++ b/ci/docker/context/README.md @@ -0,0 +1 @@ +Placeholder for the Docker build context. \ No newline at end of file diff --git a/ci/docker/create_multiarch_manifest.sh b/ci/docker/create_multiarch_manifest.sh new file mode 100644 index 0000000000..a579e9e32e --- /dev/null +++ b/ci/docker/create_multiarch_manifest.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Function to check if a Docker image exists in the registry +check_image_exists() { + local image=$1 + echo "Checking if image exists: $image" + + # Try to pull the image manifest to check if it exists + if docker manifest inspect "$image" >/dev/null 2>&1; then + echo "✓ Image exists: $image" + return 0 + else + echo "✗ Image does not exist: $image" + return 1 + fi +} + +# Function to create manifest with error checking +create_manifest() { + local manifest_name=$1 + local amd64_image=$2 + local arm64_image=$3 + + echo "Creating manifest: $manifest_name" + + # Check if both architecture images exist + if ! check_image_exists "$amd64_image"; then + echo "Error: AMD64 image not found: $amd64_image" + return 1 + fi + + if ! check_image_exists "$arm64_image"; then + echo "Error: ARM64 image not found: $arm64_image" + return 1 + fi + + # Create the manifest + echo "Creating multi-arch manifest..." + docker manifest create --amend "$manifest_name" "$amd64_image" "$arm64_image" + + # Annotate with architecture information + echo "Annotating ARM64 architecture..." + docker manifest annotate "$manifest_name" "$arm64_image" --arch arm64 + + echo "Annotating AMD64 architecture..." 
+ docker manifest annotate "$manifest_name" "$amd64_image" --arch amd64 + + # Push the manifest + echo "Pushing manifest: $manifest_name" + docker manifest push "$manifest_name" + + echo "✓ Successfully created and pushed manifest: $manifest_name" +} + +# Create manifest for dockerhub and nvstaging +echo "=== Creating Docker Hub manifests ===" +create_manifest \ + "nvidia/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}" \ + "nvidia/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-amd64" \ + "nvidia/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-arm64" + +echo "=== Creating NVCR staging manifests ===" +create_manifest \ + "nvcr.io/nvstaging/nvaie/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}" \ + "nvcr.io/nvstaging/nvaie/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-amd64" \ + "nvcr.io/nvstaging/nvaie/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-arm64" + +# Only create latest manifests for release builds +if [[ "${BUILD_TYPE}" == "release" ]]; then + echo "=== Creating latest manifests for release build ===" + + echo "Creating Docker Hub latest manifest..." + create_manifest \ + "nvidia/cuopt:latest-cuda${CUDA_SHORT}-py${PYTHON_SHORT}" \ + "nvidia/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-amd64" \ + "nvidia/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-arm64" + + echo "Creating NVCR staging latest manifest..." 
+ create_manifest \ + "nvcr.io/nvstaging/nvaie/cuopt:latest-cuda${CUDA_SHORT}-py${PYTHON_SHORT}" \ + "nvcr.io/nvstaging/nvaie/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-amd64" \ + "nvcr.io/nvstaging/nvaie/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-arm64" +else + echo "Skipping latest manifest creation (BUILD_TYPE=${BUILD_TYPE}, not 'release')" +fi + +echo "=== Multi-architecture manifest creation completed ===" \ No newline at end of file diff --git a/ci/docker/test_image.sh b/ci/docker/test_image.sh new file mode 100644 index 0000000000..1a3272cdca --- /dev/null +++ b/ci/docker/test_image.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +chsh -s /bin/bash cuopt + +# Install dependencies +apt-get update +DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends file bzip2 + +# Download test data +bash datasets/linear_programming/download_pdlp_test_dataset.sh +bash datasets/mip/download_miplib_test_dataset.sh +pushd ./datasets +./get_test_data.sh --solomon +./get_test_data.sh --tsp +popd + +# Create symlink to cuopt +ln -sf "$(pwd)" /home/cuopt/cuopt + +# Set permissions since the repo is mounted on root +chmod -R a+w "$(pwd)" + +# If this script is being run as root, use 'su - cuopt -c ""' to run each command as cuopt. 
+ +# Change to cuopt home directory and then to cuopt repo +cat > /home/cuopt/test.sh < Date: Mon, 28 Jul 2025 09:17:05 +0000 Subject: [PATCH 10/11] run concurrent LP instead --- cpp/src/linear_programming/solve.cu | 10 ++++++++-- cpp/src/linear_programming/solve.cuh | 7 +++++++ cpp/src/mip/solver.cu | 20 ++++++++++++-------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/cpp/src/linear_programming/solve.cu b/cpp/src/linear_programming/solve.cu index df3d3d1e1b..c069859978 100644 --- a/cpp/src/linear_programming/solve.cu +++ b/cpp/src/linear_programming/solve.cu @@ -467,7 +467,7 @@ void run_dual_simplex_thread( template optimization_problem_solution_t run_concurrent( - optimization_problem_t& op_problem, + const optimization_problem_t& op_problem, detail::problem_t& problem, pdlp_solver_settings_t const& settings, bool is_batch_mode) @@ -540,7 +540,7 @@ optimization_problem_solution_t run_concurrent( template optimization_problem_solution_t solve_lp_with_method( - optimization_problem_t& op_problem, + const optimization_problem_t& op_problem, detail::problem_t& problem, pdlp_solver_settings_t const& settings, bool is_batch_mode) @@ -714,6 +714,12 @@ optimization_problem_solution_t solve_lp( bool problem_checking, \ bool use_pdlp_solver_mode); \ \ + template optimization_problem_solution_t solve_lp_with_method( \ + const optimization_problem_t& op_problem, \ + detail::problem_t& problem, \ + pdlp_solver_settings_t const& settings, \ + bool is_batch_mode = false); \ + \ template optimization_problem_t mps_data_model_to_optimization_problem( \ raft::handle_t const* handle_ptr, \ const cuopt::mps_parser::mps_data_model_t& data_model); diff --git a/cpp/src/linear_programming/solve.cuh b/cpp/src/linear_programming/solve.cuh index bd7eee8df4..3024d6774b 100644 --- a/cpp/src/linear_programming/solve.cuh +++ b/cpp/src/linear_programming/solve.cuh @@ -30,4 +30,11 @@ cuopt::linear_programming::optimization_problem_t mps_data_model_to_op raft::handle_t const* 
handle_ptr, const cuopt::mps_parser::mps_data_model_t& data_model); +template +cuopt::linear_programming::optimization_problem_solution_t solve_lp_with_method( + const optimization_problem_t& op_problem, + detail::problem_t& problem, + pdlp_solver_settings_t const& settings, + bool is_batch_mode = false); + } // namespace cuopt::linear_programming diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu index ee4feb83fa..82fa6317c3 100644 --- a/cpp/src/mip/solver.cu +++ b/cpp/src/mip/solver.cu @@ -24,6 +24,7 @@ #include "solver.cuh" #include +#include #include #include @@ -126,16 +127,19 @@ solution_t mip_solver_t::run_solver() return sol; } - // if the problem was reduced to a LP: run PDLP + // if the problem was reduced to a LP: run concurrent LP if (context.problem_ptr->n_integer_vars == 0) { - CUOPT_LOG_INFO("Problem reduced to a LP, running PDLP"); - pdlp_solver_settings_t pdlp_settings{}; - pdlp_settings.time_limit = timer_.remaining_time(); - detail::pdlp_solver_t solver(*context.problem_ptr, pdlp_settings, false); - auto start_time = std::chrono::high_resolution_clock::now(); - auto opt_sol = solver.run_solver(start_time); + CUOPT_LOG_INFO("Problem reduced to a LP, running concurrent LP"); + pdlp_solver_settings_t settings{}; + settings.time_limit = timer_.remaining_time(); + settings.method = method_t::Concurrent; + + auto opt_sol = solve_lp_with_method( + *context.problem_ptr->original_problem_ptr, *context.problem_ptr, settings); + solution_t sol(*context.problem_ptr); - sol.copy_new_assignment(host_copy(opt_sol.get_primal_solution())); + sol.copy_new_assignment( + host_copy(opt_sol.get_primal_solution())); // TODO: check if this is correct if (opt_sol.get_termination_status() == pdlp_termination_status_t::Optimal || opt_sol.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || opt_sol.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { From a5e678e6a0e202d8939655623fff7b2dff1973e2 Mon Sep 17 00:00:00 2001 
From: Alice Boucher Date: Mon, 28 Jul 2025 09:19:51 +0000 Subject: [PATCH 11/11] remove todo --- cpp/src/mip/solver.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu index 82fa6317c3..0f2117991f 100644 --- a/cpp/src/mip/solver.cu +++ b/cpp/src/mip/solver.cu @@ -138,8 +138,7 @@ solution_t mip_solver_t::run_solver() *context.problem_ptr->original_problem_ptr, *context.problem_ptr, settings); solution_t sol(*context.problem_ptr); - sol.copy_new_assignment( - host_copy(opt_sol.get_primal_solution())); // TODO: check if this is correct + sol.copy_new_assignment(host_copy(opt_sol.get_primal_solution())); if (opt_sol.get_termination_status() == pdlp_termination_status_t::Optimal || opt_sol.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible || opt_sol.get_termination_status() == pdlp_termination_status_t::DualInfeasible) {