From a37c249279b2588fc5a6fa5a813c6035b3ac08d5 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 1 Apr 2026 14:52:48 -0500 Subject: [PATCH 01/18] Add changes to collect core dumps --- ci/cuopt_coredumps.sh | 71 ++++++++++++++++++++++++++++++++++ ci/test_cpp.sh | 7 ++++ ci/test_cpp_memcheck.sh | 7 ++++ ci/test_notebooks.sh | 7 ++++ ci/test_python.sh | 7 ++++ ci/test_self_hosted_service.sh | 7 ++++ ci/test_skills_assets.sh | 7 ++++ ci/test_wheel_cuopt.sh | 7 ++++ ci/test_wheel_cuopt_server.sh | 7 ++++ 9 files changed, 127 insertions(+) create mode 100644 ci/cuopt_coredumps.sh diff --git a/ci/cuopt_coredumps.sh b/ci/cuopt_coredumps.sh new file mode 100644 index 0000000000..650382174b --- /dev/null +++ b/ci/cuopt_coredumps.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Enable core dumps during CI test scripts and collect files into +# ${RAPIDS_ARTIFACTS_DIR}/coredumps so rapids-upload-artifacts-dir uploads them (S3). +# +# Shells: source this file from repo ci/ scripts, then call cuopt_enable_coredumps +# and trap cuopt_collect_coredumps on EXIT. 
+ +cuopt_enable_coredumps() { + local ws base pattern + ws="${GITHUB_WORKSPACE:-${PWD}}" + base="${RAPIDS_ARTIFACTS_DIR:-${ws}/artifacts}" + export CUOPT_COREDUMP_DIR="${base}/coredumps" + mkdir -p "${CUOPT_COREDUMP_DIR}" + + ulimit -c unlimited 2>/dev/null || true + + if [[ -w /proc/sys/kernel/core_pattern ]]; then + echo "${CUOPT_COREDUMP_DIR}/core.%e.%p.%t" >/proc/sys/kernel/core_pattern 2>/dev/null || true + fi + + pattern="$(cat /proc/sys/kernel/core_pattern 2>/dev/null || echo n/a)" + if declare -F rapids-logger &>/dev/null; then + rapids-logger "Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit -c=$(ulimit -c) core_pattern=${pattern}" + else + echo "Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit -c=$(ulimit -c) core_pattern=${pattern}" + fi +} + +cuopt_collect_coredumps() { + local ws base dest n_before n_after f rel dest_name dest_path + ws="${GITHUB_WORKSPACE:-${PWD}}" + base="${RAPIDS_ARTIFACTS_DIR:-${ws}/artifacts}" + dest="${base}/coredumps" + mkdir -p "${dest}" + + n_before="$(find "${dest}" -type f 2>/dev/null | wc -l | tr -d '[:space:]')" + + while IFS= read -r -d '' f; do + [[ -f "${f}" ]] || continue + case "${f}" in + "${dest}/"*) continue ;; + esac + rel="${f#"${ws}"/}" + if [[ "${rel}" == "${f}" ]]; then + rel="$(basename "${f}")" + fi + dest_name="${rel//\//_}" + dest_path="${dest}/${dest_name}" + if [[ -e "${dest_path}" ]]; then + dest_path="${dest}/${dest_name}.${RANDOM}" + fi + cp -a "${f}" "${dest_path}" 2>/dev/null || true + done < <( + find "${ws}" \ + \( -path '*/.git/*' -o -path '*/opt/conda/*' -o -path '*/conda_pkgs/*' -o -path '*/artifacts/coredumps/*' \) -prune -o \ + \( -name 'core' -o -name 'core.*' \) -type f -print0 2>/dev/null + ) + + n_after="$(find "${dest}" -type f 2>/dev/null | wc -l | tr -d '[:space:]')" + if [[ "${n_after}" -gt "${n_before}" ]]; then + if declare -F rapids-logger &>/dev/null; then + rapids-logger "Wrote $((n_after - n_before)) core file(s) into ${dest} (${n_after} total)" + else + echo "cuOpt coredumps: 
${n_after} file(s) in ${dest}" + fi + fi +} diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 653c44133a..570f4816d9 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -5,6 +5,13 @@ set -euo pipefail +_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ci/cuopt_coredumps.sh +source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" +cuopt_enable_coredumps +trap 'cuopt_collect_coredumps || true' EXIT +unset _CUOPT_CI_DIR + . /opt/conda/etc/profile.d/conda.sh rapids-logger "Configuring conda strict channel priority" diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh index bc4bce4cbc..48199eecef 100755 --- a/ci/test_cpp_memcheck.sh +++ b/ci/test_cpp_memcheck.sh @@ -10,6 +10,13 @@ fi set -euo pipefail +_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ci/cuopt_coredumps.sh +source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" +cuopt_enable_coredumps +trap 'cuopt_collect_coredumps || true' EXIT +unset _CUOPT_CI_DIR + . /opt/conda/etc/profile.d/conda.sh rapids-logger "Configuring conda strict channel priority" diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index 22c41af84c..8e0df1b4df 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -5,6 +5,13 @@ set -euo pipefail +_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ci/cuopt_coredumps.sh +source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" +cuopt_enable_coredumps +trap 'cuopt_collect_coredumps || true' EXIT +unset _CUOPT_CI_DIR + . 
/opt/conda/etc/profile.d/conda.sh rapids-logger "Configuring conda strict channel priority" diff --git a/ci/test_python.sh b/ci/test_python.sh index 4f91c83334..aec654ad17 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -5,6 +5,13 @@ set -euo pipefail +_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ci/cuopt_coredumps.sh +source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" +cuopt_enable_coredumps +trap 'cuopt_collect_coredumps || true' EXIT +unset _CUOPT_CI_DIR + . /opt/conda/etc/profile.d/conda.sh rapids-logger "Configuring conda strict channel priority" diff --git a/ci/test_self_hosted_service.sh b/ci/test_self_hosted_service.sh index 601b45326b..c9b7d509b6 100755 --- a/ci/test_self_hosted_service.sh +++ b/ci/test_self_hosted_service.sh @@ -5,6 +5,13 @@ set -euo pipefail +_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ci/cuopt_coredumps.sh +source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" +cuopt_enable_coredumps +trap 'cuopt_collect_coredumps || true' EXIT +unset _CUOPT_CI_DIR + source rapids-init-pip # Download the cuopt built in the previous step diff --git a/ci/test_skills_assets.sh b/ci/test_skills_assets.sh index c75645cb93..ccb5161a11 100755 --- a/ci/test_skills_assets.sh +++ b/ci/test_skills_assets.sh @@ -10,6 +10,13 @@ set -euo pipefail +_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ci/cuopt_coredumps.sh +source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" +cuopt_enable_coredumps +trap 'cuopt_collect_coredumps || true' EXIT +unset _CUOPT_CI_DIR + # Use rapids-logger in CI; fall back to echo for local testing if command -v rapids-logger &>/dev/null; then log() { rapids-logger "$*"; } diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh index a327082e83..33a5300581 100755 --- a/ci/test_wheel_cuopt.sh +++ b/ci/test_wheel_cuopt.sh @@ -5,6 +5,13 @@ set -euo pipefail +_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck 
source=ci/cuopt_coredumps.sh +source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" +cuopt_enable_coredumps +trap 'cuopt_collect_coredumps || true' EXIT +unset _CUOPT_CI_DIR + # sets up a constraints file for 'pip' and puts its location in an exported variable PIP_EXPORT, # so those constraints will affect all future 'pip install' calls source rapids-init-pip diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh index a76969b965..19c4bf7c1a 100755 --- a/ci/test_wheel_cuopt_server.sh +++ b/ci/test_wheel_cuopt_server.sh @@ -5,6 +5,13 @@ set -eou pipefail +_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=ci/cuopt_coredumps.sh +source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" +cuopt_enable_coredumps +trap 'cuopt_collect_coredumps || true' EXIT +unset _CUOPT_CI_DIR + source rapids-init-pip # Download the packages built in the previous step From 2fd437fc2b2e0a03a6eeef246fc8a0502b187a06 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 1 Apr 2026 15:29:06 -0500 Subject: [PATCH 02/18] fix style --- ci/test_cpp.sh | 2 +- ci/test_cpp_memcheck.sh | 2 +- ci/test_notebooks.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 570f4816d9..bd65f927a3 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -1,6 +1,6 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh index 48199eecef..d9ddbc7edd 100755 --- a/ci/test_cpp_memcheck.sh +++ b/ci/test_cpp_memcheck.sh @@ -1,6 +1,6 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: Apache-2.0 if [[ "$(date +%A)" != "Friday" ]]; then diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index 8e0df1b4df..b7897e4032 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -1,6 +1,6 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail From 0da2a0de87acf39c0c734720f1cd5d9db4286357 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 1 Apr 2026 15:33:19 -0500 Subject: [PATCH 03/18] add core dump test --- ci/run_ctests.sh | 17 ++++++++++ cpp/tests/utilities/CMakeLists.txt | 3 ++ cpp/tests/utilities/coredump_sanity_test.cpp | 34 ++++++++++++++++++++ 3 files changed, 54 insertions(+) create mode 100644 cpp/tests/utilities/coredump_sanity_test.cpp diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh index fc1de8e1b4..0287fcbc61 100755 --- a/ci/run_ctests.sh +++ b/ci/run_ctests.sh @@ -21,6 +21,23 @@ else exit 1 fi +# Run first: intentional SIGSEGV to validate core dump collection (ci/cuopt_coredumps.sh). +# Expect a non-zero exit (e.g. 139). The same binary is invoked again in the loop without +# CUOPT_TEST_COREDUMP and skips the fatal case. +if [[ -x "${GTEST_DIR}/COREDUMP_SANITY_TEST" ]]; then + echo "Running COREDUMP_SANITY_TEST with CUOPT_TEST_COREDUMP=1 (expected fatal signal)" + set +e + CUOPT_TEST_COREDUMP=1 "${GTEST_DIR}/COREDUMP_SANITY_TEST" "$@" + _coredump_ret=$? 
+ set -e + if [[ "${_coredump_ret}" -eq 0 ]]; then + echo "ERROR: COREDUMP_SANITY_TEST exited 0 with CUOPT_TEST_COREDUMP=1; expected crash" >&2 + exit 1 + fi +else + echo "Skipping COREDUMP_SANITY_TEST (binary not found)" +fi + for gt in "${GTEST_DIR}"/*_TEST; do test_name=$(basename "${gt}") echo "Running gtest ${test_name}" diff --git a/cpp/tests/utilities/CMakeLists.txt b/cpp/tests/utilities/CMakeLists.txt index 5f9e6d5e82..73020cf0a9 100644 --- a/cpp/tests/utilities/CMakeLists.txt +++ b/cpp/tests/utilities/CMakeLists.txt @@ -5,3 +5,6 @@ # Add CLI end-to-end test ConfigureTest(CLI_TEST test_cli.cpp) + +# Opt-in SIGSEGV for validating core dump collection (CUOPT_TEST_COREDUMP=1) +ConfigureTest(COREDUMP_SANITY_TEST coredump_sanity_test.cpp) diff --git a/cpp/tests/utilities/coredump_sanity_test.cpp b/cpp/tests/utilities/coredump_sanity_test.cpp new file mode 100644 index 0000000000..31fafc61f7 --- /dev/null +++ b/cpp/tests/utilities/coredump_sanity_test.cpp @@ -0,0 +1,34 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include +#include +#include + +namespace { + +bool coredump_env_enabled() +{ + const char* v = std::getenv("CUOPT_TEST_COREDUMP"); + if (v == nullptr || v[0] == '\0') { return false; } + return std::strcmp(v, "0") != 0; +} + +} // namespace + +// Opt-in crash to validate CI core dump collection (see ci/cuopt_coredumps.sh). +// Normal CI: skipped. 
To reproduce locally or in a one-off job: +// CUOPT_TEST_COREDUMP=1 ulimit -c unlimited ./COREDUMP_SANITY_TEST +TEST(CoredumpSanity, IntentionalSegfaultWhenEnvSet) +{ + if (!coredump_env_enabled()) { + GTEST_SKIP() << "Set CUOPT_TEST_COREDUMP=1 to intentionally SIGSEGV for core dump checks."; + } + std::raise(SIGSEGV); +} From 5f44aac20985ea0a678c2c27f5b50613047d80b6 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 1 Apr 2026 15:34:04 -0500 Subject: [PATCH 04/18] testing --- cpp/tests/utilities/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/utilities/CMakeLists.txt b/cpp/tests/utilities/CMakeLists.txt index 73020cf0a9..89958f32e1 100644 --- a/cpp/tests/utilities/CMakeLists.txt +++ b/cpp/tests/utilities/CMakeLists.txt @@ -1,5 +1,5 @@ # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # cmake-format: on From 7276e858bee48edcc73907ae77fb6d58768f6aae Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 11:56:35 -0500 Subject: [PATCH 05/18] enable core dump for all --- ci/cuopt_coredumps.sh | 174 +++++++++++++++++++++++++++++++-- ci/run_ctests.sh | 8 +- ci/test_cpp.sh | 13 ++- ci/test_cpp_memcheck.sh | 1 + ci/test_notebooks.sh | 1 + ci/test_python.sh | 1 + ci/test_self_hosted_service.sh | 1 + ci/test_skills_assets.sh | 1 + ci/test_wheel_cuopt.sh | 1 + ci/test_wheel_cuopt_server.sh | 1 + 10 files changed, 190 insertions(+), 12 deletions(-) diff --git a/ci/cuopt_coredumps.sh b/ci/cuopt_coredumps.sh index 650382174b..f5d294d9f4 100644 --- a/ci/cuopt_coredumps.sh +++ b/ci/cuopt_coredumps.sh @@ -4,37 +4,197 @@ # SPDX-License-Identifier: Apache-2.0 # Enable core dumps during CI test scripts and collect files into -# ${RAPIDS_ARTIFACTS_DIR}/coredumps so rapids-upload-artifacts-dir uploads them (S3). +# ${RAPIDS_ARTIFACTS_DIR}/${CUOPT_GDB_CORE_ARTIFACT_DIR} so rapids-upload-artifacts-dir +# uploads them to S3 as: +# {rapids-matrix-prefix}.{cuopt-gdb-cores_JOB_cudaVER_pyVER_arch_BUILDTYPE} +# (tarball of that directory). The trailing segment includes a job label, resolved in order: +# 1) CUOPT_CI_JOB_LABEL if set (workflow/setter can export the real GitHub job id). +# 2) GITHUB_JOB if it looks like a caller job id (not generic RAPIDS callee ids such as tests). +# 3) The ci/test_*.sh that sourced this file: label derived by naming rules (new drivers need +# no edits here — e.g. test_foo.sh → conda-foo-tests, test_wheel_bar.sh → wheel-bar-tests). +# Then CUDA / Python / arch / build_type from RAPIDS CI env. # -# Shells: source this file from repo ci/ scripts, then call cuopt_enable_coredumps -# and trap cuopt_collect_coredumps on EXIT. 
+# Coredump smoke (fails the job immediately after setup so you can verify artifacts): +# • On GitHub Actions (GITHUB_ACTIONS=true): runs unless CUOPT_CI_COREDUMP_SMOKE is 0|false|no|off. +# • Elsewhere: runs only if CUOPT_CI_COREDUMP_SMOKE is 1|true|yes|on (local ad-hoc testing). +# cuopt_coredump_smoke_fail_job_if_enabled SIGSEGVs a child, EXIT trap collects cores, exit 139. +# +# Shells: source this file from repo ci/ scripts, then call cuopt_enable_coredumps and trap +# cuopt_collect_coredumps on EXIT, then cuopt_coredump_smoke_fail_job_if_enabled. + +# Set in cuopt_enable_coredumps; collect reuses when non-empty. +CUOPT_GDB_CORE_ARTIFACT_DIR= + +# Reusable RAPIDS workflows often run a job literally named "tests" / "build" — not unique. +cuopt__github_job_is_generic() { + case "${1:-}" in + "" | tests | build | compute-matrix | prepare | package) return 0 ;; + *) return 1 ;; + esac +} + +# test_cpp.sh → conda-cpp-tests; test_wheel_cuopt_server.sh → wheel-cuopt-server-tests; etc. +cuopt__job_label_from_entry_script_basename() { + local b="$1" + b="${b%.sh}" + case "$b" in + test_wheel_*) + b="${b#test_wheel_}" + echo "wheel-${b//_/-}-tests" + ;; + test_self_hosted_*) + b="${b#test_self_hosted_}" + echo "self-hosted-${b//_/-}-tests" + ;; + test_skills_*) + b="${b#test_skills_}" + echo "conda-skills-${b//_/-}" + ;; + test_*memcheck) + b="${b#test_}" + echo "conda-${b//_/-}" + ;; + test_*) + b="${b#test_}" + echo "conda-${b//_/-}-tests" + ;; + *) + echo "unknown-job" + ;; + esac +} + +cuopt__find_ci_entry_test_script_basename() { + local i f base + for ((i = 0; i < ${#BASH_SOURCE[@]}; i++)); do + f="${BASH_SOURCE[$i]}" + base="$(basename "${f}")" + [[ "${base}" == "cuopt_coredumps.sh" ]] && continue + case "${base}" in + test_*.sh) echo "${base}"; return ;; + esac + done + echo "" +} + +cuopt__infer_ci_job_label_from_call_stack() { + local nb + nb="$(cuopt__find_ci_entry_test_script_basename)" + if [[ -n "${nb}" ]]; then + 
cuopt__job_label_from_entry_script_basename "${nb}" + return + fi + echo "unknown-job" +} + +cuopt__resolve_ci_job_label() { + if [[ -n "${CUOPT_CI_JOB_LABEL:-}" ]]; then + echo "${CUOPT_CI_JOB_LABEL}" + return + fi + if [[ -n "${GITHUB_JOB:-}" ]] && ! cuopt__github_job_is_generic "${GITHUB_JOB}"; then + echo "${GITHUB_JOB}" + return + fi + cuopt__infer_ci_job_label_from_call_stack +} + +cuopt__gdb_core_artifact_basename() { + local job cuda_ver py_ver arch_ bt + job="$(cuopt__resolve_ci_job_label)" + job="${job//[^a-zA-Z0-9_-]/_}" + cuda_ver="${RAPIDS_CUDA_VERSION:-unknown}" + cuda_ver="${cuda_ver//[^a-zA-Z0-9._-]/_}" + py_ver="${RAPIDS_PY_VERSION:-na}" + py_ver="${py_ver//[^a-zA-Z0-9._-]/_}" + arch_="$(arch 2>/dev/null || true)" + [[ -z "${arch_}" ]] && arch_="$(uname -m)" + arch_="${arch_//[^a-zA-Z0-9_-]/_}" + bt="${RAPIDS_BUILD_TYPE:-na}" + bt="${bt//[^a-zA-Z0-9_-]/_}" + echo "cuopt-gdb-cores_${job}_cuda${cuda_ver}_py${py_ver}_${arch_}_${bt}" +} cuopt_enable_coredumps() { local ws base pattern ws="${GITHUB_WORKSPACE:-${PWD}}" base="${RAPIDS_ARTIFACTS_DIR:-${ws}/artifacts}" - export CUOPT_COREDUMP_DIR="${base}/coredumps" + export CUOPT_CI_JOB_LABEL="$(cuopt__resolve_ci_job_label)" + CUOPT_GDB_CORE_ARTIFACT_DIR="$(cuopt__gdb_core_artifact_basename)" + export CUOPT_GDB_CORE_ARTIFACT_DIR + export CUOPT_COREDUMP_DIR="${base}/${CUOPT_GDB_CORE_ARTIFACT_DIR}" mkdir -p "${CUOPT_COREDUMP_DIR}" + local pattern_target="${CUOPT_COREDUMP_DIR}/core.%e.%p.%t" + + # Raise soft limit to match hard limit when possible (some shells default to 0). ulimit -c unlimited 2>/dev/null || true + ulimit -H -c unlimited 2>/dev/null || true + # When unset, ask the kernel for broad core dump contents (Linux 4.6+; ignored elsewhere). + if [[ -z "${COREDUMP_FILTER:-}" ]]; then + export COREDUMP_FILTER=0xff + fi + + # Prefer writing cores as files under CUOPT_COREDUMP_DIR (often fails in unprivileged Docker). 
if [[ -w /proc/sys/kernel/core_pattern ]]; then - echo "${CUOPT_COREDUMP_DIR}/core.%e.%p.%t" >/proc/sys/kernel/core_pattern 2>/dev/null || true + echo "${pattern_target}" >/proc/sys/kernel/core_pattern 2>/dev/null || true + fi + if command -v sysctl >/dev/null 2>&1; then + sysctl -q -w "kernel.core_pattern=${pattern_target}" 2>/dev/null || true fi pattern="$(cat /proc/sys/kernel/core_pattern 2>/dev/null || echo n/a)" if declare -F rapids-logger &>/dev/null; then rapids-logger "Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit -c=$(ulimit -c) core_pattern=${pattern}" + if [[ "${pattern}" == \|* ]]; then + rapids-logger "WARNING: core_pattern pipes to a collector (e.g. apport); cores may not appear as files under ${CUOPT_COREDUMP_DIR}. Use a writable core_pattern or a privileged runner if needed." + fi else echo "Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit -c=$(ulimit -c) core_pattern=${pattern}" + if [[ "${pattern}" == \|* ]]; then + echo "WARNING: core_pattern pipes to a collector; files may not land in ${CUOPT_COREDUMP_DIR}" >&2 + fi fi } +# Deliberate SIGSEGV in a subprocess, then exit so CI fails and EXIT trap still runs collection. +cuopt_coredump_smoke_fail_job_if_enabled() { + case "${CUOPT_CI_COREDUMP_SMOKE:-}" in + 0 | false | FALSE | no | NO | off | OFF) return 0 ;; + esac + + local run_smoke=0 + if [[ "${GITHUB_ACTIONS:-}" == "true" ]]; then + run_smoke=1 + fi + case "${CUOPT_CI_COREDUMP_SMOKE:-}" in + 1 | true | TRUE | yes | YES | on | ON) run_smoke=1 ;; + esac + if [[ "${run_smoke}" -eq 0 ]]; then + return 0 + fi + + if declare -F rapids-logger &>/dev/null; then + rapids-logger "Coredump smoke: SIGSEGV in child bash (GITHUB_ACTIONS=${GITHUB_ACTIONS:-unset}, CUOPT_CI_COREDUMP_SMOKE=${CUOPT_CI_COREDUMP_SMOKE:-unset}); exiting 139" + else + echo "Coredump smoke: SIGSEGV child; exiting 139 (GITHUB_ACTIONS=${GITHUB_ACTIONS:-unset})" >&2 + fi + # Crash the child only: if the main script died from SIGSEGV, the EXIT trap would not run. 
+ bash -c 'kill -SEGV "$$"' || true + # Brief pause so the kernel can finish writing the core file before the parent exits. + sleep 0.2 + exit 139 +} + cuopt_collect_coredumps() { local ws base dest n_before n_after f rel dest_name dest_path ws="${GITHUB_WORKSPACE:-${PWD}}" base="${RAPIDS_ARTIFACTS_DIR:-${ws}/artifacts}" - dest="${base}/coredumps" + if [[ -z "${CUOPT_GDB_CORE_ARTIFACT_DIR:-}" ]]; then + CUOPT_GDB_CORE_ARTIFACT_DIR="$(cuopt__gdb_core_artifact_basename)" + fi + dest="${base}/${CUOPT_GDB_CORE_ARTIFACT_DIR}" mkdir -p "${dest}" n_before="$(find "${dest}" -type f 2>/dev/null | wc -l | tr -d '[:space:]')" @@ -56,7 +216,7 @@ cuopt_collect_coredumps() { cp -a "${f}" "${dest_path}" 2>/dev/null || true done < <( find "${ws}" \ - \( -path '*/.git/*' -o -path '*/opt/conda/*' -o -path '*/conda_pkgs/*' -o -path '*/artifacts/coredumps/*' \) -prune -o \ + \( -path '*/.git/*' -o -path '*/opt/conda/*' -o -path '*/conda_pkgs/*' -o -path "${dest}/*" \) -prune -o \ \( -name 'core' -o -name 'core.*' \) -type f -print0 2>/dev/null ) diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh index 0287fcbc61..d808ed5403 100755 --- a/ci/run_ctests.sh +++ b/ci/run_ctests.sh @@ -38,11 +38,17 @@ else echo "Skipping COREDUMP_SANITY_TEST (binary not found)" fi +shopt -s nullglob for gt in "${GTEST_DIR}"/*_TEST; do test_name=$(basename "${gt}") echo "Running gtest ${test_name}" - "${gt}" "$@" + if ! "${gt}" "$@"; then + _g_rc=$? + echo "ERROR: gtest ${test_name} failed (exit ${_g_rc}); stopping run_ctests.sh" >&2 + exit "${_g_rc}" + fi done +shopt -u nullglob # Run C_API_TEST with CPU memory for local solves (excluding time limit tests) if [ -x "${GTEST_DIR}/C_API_TEST" ]; then diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index bd65f927a3..d391f47e09 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -11,6 +11,7 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR +cuopt_coredump_smoke_fail_job_if_enabled . 
/opt/conda/etc/profile.d/conda.sh @@ -51,8 +52,6 @@ pushd "${RAPIDS_DATASET_ROOT_DIR}" ./get_test_data.sh popd -EXITCODE=0 -trap "EXITCODE=1" ERR set +e # Run gtests from libcuopt-tests package @@ -60,6 +59,12 @@ export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ rapids-logger "Run gtests" timeout 40m ./ci/run_ctests.sh +EXITCODE=$? +set -e -rapids-logger "Test script exiting with value: $EXITCODE" -exit ${EXITCODE} +if [[ "${EXITCODE}" -ne 0 ]]; then + rapids-logger "run_ctests.sh failed (exit ${EXITCODE}); skipping remaining steps" +fi + +rapids-logger "Test script exiting with value: ${EXITCODE}" +exit "${EXITCODE}" diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh index d9ddbc7edd..aab3bfb6b8 100755 --- a/ci/test_cpp_memcheck.sh +++ b/ci/test_cpp_memcheck.sh @@ -16,6 +16,7 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR +cuopt_coredump_smoke_fail_job_if_enabled . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index b7897e4032..424e8a6cd3 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -11,6 +11,7 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR +cuopt_coredump_smoke_fail_job_if_enabled . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_python.sh b/ci/test_python.sh index aec654ad17..6e13f19032 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -11,6 +11,7 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR +cuopt_coredump_smoke_fail_job_if_enabled . 
/opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_self_hosted_service.sh b/ci/test_self_hosted_service.sh index c9b7d509b6..7150356764 100755 --- a/ci/test_self_hosted_service.sh +++ b/ci/test_self_hosted_service.sh @@ -11,6 +11,7 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR +cuopt_coredump_smoke_fail_job_if_enabled source rapids-init-pip diff --git a/ci/test_skills_assets.sh b/ci/test_skills_assets.sh index ccb5161a11..ec934b079d 100755 --- a/ci/test_skills_assets.sh +++ b/ci/test_skills_assets.sh @@ -16,6 +16,7 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR +cuopt_coredump_smoke_fail_job_if_enabled # Use rapids-logger in CI; fall back to echo for local testing if command -v rapids-logger &>/dev/null; then diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh index 33a5300581..1b8d83a176 100755 --- a/ci/test_wheel_cuopt.sh +++ b/ci/test_wheel_cuopt.sh @@ -11,6 +11,7 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR +cuopt_coredump_smoke_fail_job_if_enabled # sets up a constraints file for 'pip' and puts its location in an exported variable PIP_EXPORT, # so those constraints will affect all future 'pip install' calls diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh index 19c4bf7c1a..62f4a5109c 100755 --- a/ci/test_wheel_cuopt_server.sh +++ b/ci/test_wheel_cuopt_server.sh @@ -11,6 +11,7 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR +cuopt_coredump_smoke_fail_job_if_enabled source rapids-init-pip From 33ab5f4b137d4d11a84dbea7c8cc1b3c91fe60ed Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 11:59:05 -0500 Subject: [PATCH 06/18] style --- ci/cuopt_coredumps.sh | 3 
++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/cuopt_coredumps.sh b/ci/cuopt_coredumps.sh index f5d294d9f4..26fe7247a1 100644 --- a/ci/cuopt_coredumps.sh +++ b/ci/cuopt_coredumps.sh @@ -119,7 +119,8 @@ cuopt_enable_coredumps() { local ws base pattern ws="${GITHUB_WORKSPACE:-${PWD}}" base="${RAPIDS_ARTIFACTS_DIR:-${ws}/artifacts}" - export CUOPT_CI_JOB_LABEL="$(cuopt__resolve_ci_job_label)" + CUOPT_CI_JOB_LABEL="$(cuopt__resolve_ci_job_label)" + export CUOPT_CI_JOB_LABEL CUOPT_GDB_CORE_ARTIFACT_DIR="$(cuopt__gdb_core_artifact_basename)" export CUOPT_GDB_CORE_ARTIFACT_DIR export CUOPT_COREDUMP_DIR="${base}/${CUOPT_GDB_CORE_ARTIFACT_DIR}" From 484d1b87402c5edf0c5a9a36247b1c33bf3a9cdd Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 13:32:30 -0500 Subject: [PATCH 07/18] check fixes --- ci/cuopt_coredumps.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ci/cuopt_coredumps.sh b/ci/cuopt_coredumps.sh index 26fe7247a1..c5e815c9dd 100644 --- a/ci/cuopt_coredumps.sh +++ b/ci/cuopt_coredumps.sh @@ -7,9 +7,12 @@ # ${RAPIDS_ARTIFACTS_DIR}/${CUOPT_GDB_CORE_ARTIFACT_DIR} so rapids-upload-artifacts-dir # uploads them to S3 as: # {rapids-matrix-prefix}.{cuopt-gdb-cores_JOB_cudaVER_pyVER_arch_BUILDTYPE} -# (tarball of that directory). The trailing segment includes a job label, resolved in order: +# RAPIDS rapids-upload-to-s3 tgz-streams each directory (gzip-compressed tar); the object +# name often has no .tar.gz suffix in listings, but downloads are still archives. Very small +# sizes (~100 B) usually mean an almost-empty archive (no core files landed on disk). The +# trailing segment includes a job label, resolved in order: # 1) CUOPT_CI_JOB_LABEL if set (workflow/setter can export the real GitHub job id). -# 2) GITHUB_JOB if it looks like a caller job id (not generic RAPIDS callee ids such as tests). +# 2) GITHUB_JOB if it looks like a caller job id (not generic RAPIDS callee ids such as tests/test). 
# 3) The ci/test_*.sh that sourced this file: label derived by naming rules (new drivers need # no edits here — e.g. test_foo.sh → conda-foo-tests, test_wheel_bar.sh → wheel-bar-tests). # Then CUDA / Python / arch / build_type from RAPIDS CI env. @@ -25,10 +28,12 @@ # Set in cuopt_enable_coredumps; collect reuses when non-empty. CUOPT_GDB_CORE_ARTIFACT_DIR= -# Reusable RAPIDS workflows often run a job literally named "tests" / "build" — not unique. +# Reusable RAPIDS workflows often use non-unique job ids ("tests", "test", "build", …). +# GITHUB_JOB=test (singular) is common; treating it as meaningful produced labels like +# "test" and hid script-based names (wheel-cuopt-tests, conda-cpp-tests). cuopt__github_job_is_generic() { case "${1:-}" in - "" | tests | build | compute-matrix | prepare | package) return 0 ;; + "" | test | tests | build | compute-matrix | prepare | package) return 0 ;; *) return 1 ;; esac } From 43b91f357d8b8b6144c4ba76b08edf43871f7d51 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 15:47:32 -0500 Subject: [PATCH 08/18] remove testing --- ci/cuopt_coredumps.sh | 36 +----------------------------- ci/run_ctests.sh | 17 -------------- ci/test_cpp.sh | 1 - cpp/tests/utilities/CMakeLists.txt | 3 --- 4 files changed, 1 insertion(+), 56 deletions(-) diff --git a/ci/cuopt_coredumps.sh b/ci/cuopt_coredumps.sh index c5e815c9dd..44b2e7030a 100644 --- a/ci/cuopt_coredumps.sh +++ b/ci/cuopt_coredumps.sh @@ -17,13 +17,8 @@ # no edits here — e.g. test_foo.sh → conda-foo-tests, test_wheel_bar.sh → wheel-bar-tests). # Then CUDA / Python / arch / build_type from RAPIDS CI env. # -# Coredump smoke (fails the job immediately after setup so you can verify artifacts): -# • On GitHub Actions (GITHUB_ACTIONS=true): runs unless CUOPT_CI_COREDUMP_SMOKE is 0|false|no|off. -# • Elsewhere: runs only if CUOPT_CI_COREDUMP_SMOKE is 1|true|yes|on (local ad-hoc testing). 
-# cuopt_coredump_smoke_fail_job_if_enabled SIGSEGVs a child, EXIT trap collects cores, exit 139. -# # Shells: source this file from repo ci/ scripts, then call cuopt_enable_coredumps and trap -# cuopt_collect_coredumps on EXIT, then cuopt_coredump_smoke_fail_job_if_enabled. +# cuopt_collect_coredumps on EXIT. # Set in cuopt_enable_coredumps; collect reuses when non-empty. CUOPT_GDB_CORE_ARTIFACT_DIR= @@ -164,35 +159,6 @@ cuopt_enable_coredumps() { fi } -# Deliberate SIGSEGV in a subprocess, then exit so CI fails and EXIT trap still runs collection. -cuopt_coredump_smoke_fail_job_if_enabled() { - case "${CUOPT_CI_COREDUMP_SMOKE:-}" in - 0 | false | FALSE | no | NO | off | OFF) return 0 ;; - esac - - local run_smoke=0 - if [[ "${GITHUB_ACTIONS:-}" == "true" ]]; then - run_smoke=1 - fi - case "${CUOPT_CI_COREDUMP_SMOKE:-}" in - 1 | true | TRUE | yes | YES | on | ON) run_smoke=1 ;; - esac - if [[ "${run_smoke}" -eq 0 ]]; then - return 0 - fi - - if declare -F rapids-logger &>/dev/null; then - rapids-logger "Coredump smoke: SIGSEGV in child bash (GITHUB_ACTIONS=${GITHUB_ACTIONS:-unset}, CUOPT_CI_COREDUMP_SMOKE=${CUOPT_CI_COREDUMP_SMOKE:-unset}); exiting 139" - else - echo "Coredump smoke: SIGSEGV child; exiting 139 (GITHUB_ACTIONS=${GITHUB_ACTIONS:-unset})" >&2 - fi - # Crash the child only: if the main script died from SIGSEGV, the EXIT trap would not run. - bash -c 'kill -SEGV "$$"' || true - # Brief pause so the kernel can finish writing the core file before the parent exits. - sleep 0.2 - exit 139 -} - cuopt_collect_coredumps() { local ws base dest n_before n_after f rel dest_name dest_path ws="${GITHUB_WORKSPACE:-${PWD}}" diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh index d808ed5403..01508ffc01 100755 --- a/ci/run_ctests.sh +++ b/ci/run_ctests.sh @@ -21,23 +21,6 @@ else exit 1 fi -# Run first: intentional SIGSEGV to validate core dump collection (ci/cuopt_coredumps.sh). -# Expect a non-zero exit (e.g. 139). 
The same binary is invoked again in the loop without -# CUOPT_TEST_COREDUMP and skips the fatal case. -if [[ -x "${GTEST_DIR}/COREDUMP_SANITY_TEST" ]]; then - echo "Running COREDUMP_SANITY_TEST with CUOPT_TEST_COREDUMP=1 (expected fatal signal)" - set +e - CUOPT_TEST_COREDUMP=1 "${GTEST_DIR}/COREDUMP_SANITY_TEST" "$@" - _coredump_ret=$? - set -e - if [[ "${_coredump_ret}" -eq 0 ]]; then - echo "ERROR: COREDUMP_SANITY_TEST exited 0 with CUOPT_TEST_COREDUMP=1; expected crash" >&2 - exit 1 - fi -else - echo "Skipping COREDUMP_SANITY_TEST (binary not found)" -fi - shopt -s nullglob for gt in "${GTEST_DIR}"/*_TEST; do test_name=$(basename "${gt}") diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index d391f47e09..1caaed4c54 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -11,7 +11,6 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR -cuopt_coredump_smoke_fail_job_if_enabled . /opt/conda/etc/profile.d/conda.sh diff --git a/cpp/tests/utilities/CMakeLists.txt b/cpp/tests/utilities/CMakeLists.txt index 89958f32e1..75c3805c6e 100644 --- a/cpp/tests/utilities/CMakeLists.txt +++ b/cpp/tests/utilities/CMakeLists.txt @@ -5,6 +5,3 @@ # Add CLI end-to-end test ConfigureTest(CLI_TEST test_cli.cpp) - -# Opt-in SIGSEGV for validating core dump collection (CUOPT_TEST_COREDUMP=1) -ConfigureTest(COREDUMP_SANITY_TEST coredump_sanity_test.cpp) From 75fc79ae1bbe7fea694171f5ad68566f21254202 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 15:48:22 -0500 Subject: [PATCH 09/18] fix --- cpp/tests/utilities/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/utilities/CMakeLists.txt b/cpp/tests/utilities/CMakeLists.txt index 75c3805c6e..5f9e6d5e82 100644 --- a/cpp/tests/utilities/CMakeLists.txt +++ b/cpp/tests/utilities/CMakeLists.txt @@ -1,5 +1,5 @@ # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on From c05f23ed8dfac89e6f2baf25ede2c7fc6d67e4c2 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 15:52:38 -0500 Subject: [PATCH 10/18] remove testing --- ci/test_cpp_memcheck.sh | 1 - ci/test_notebooks.sh | 1 - ci/test_python.sh | 1 - ci/test_self_hosted_service.sh | 1 - ci/test_skills_assets.sh | 1 - ci/test_wheel_cuopt.sh | 1 - ci/test_wheel_cuopt_server.sh | 1 - cpp/tests/utilities/coredump_sanity_test.cpp | 34 -------------------- 8 files changed, 41 deletions(-) delete mode 100644 cpp/tests/utilities/coredump_sanity_test.cpp diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh index aab3bfb6b8..d9ddbc7edd 100755 --- a/ci/test_cpp_memcheck.sh +++ b/ci/test_cpp_memcheck.sh @@ -16,7 +16,6 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR -cuopt_coredump_smoke_fail_job_if_enabled . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index 424e8a6cd3..b7897e4032 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -11,7 +11,6 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR -cuopt_coredump_smoke_fail_job_if_enabled . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_python.sh b/ci/test_python.sh index 6e13f19032..aec654ad17 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -11,7 +11,6 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR -cuopt_coredump_smoke_fail_job_if_enabled . 
/opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_self_hosted_service.sh b/ci/test_self_hosted_service.sh index 7150356764..c9b7d509b6 100755 --- a/ci/test_self_hosted_service.sh +++ b/ci/test_self_hosted_service.sh @@ -11,7 +11,6 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR -cuopt_coredump_smoke_fail_job_if_enabled source rapids-init-pip diff --git a/ci/test_skills_assets.sh b/ci/test_skills_assets.sh index ec934b079d..ccb5161a11 100755 --- a/ci/test_skills_assets.sh +++ b/ci/test_skills_assets.sh @@ -16,7 +16,6 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR -cuopt_coredump_smoke_fail_job_if_enabled # Use rapids-logger in CI; fall back to echo for local testing if command -v rapids-logger &>/dev/null; then diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh index 1b8d83a176..33a5300581 100755 --- a/ci/test_wheel_cuopt.sh +++ b/ci/test_wheel_cuopt.sh @@ -11,7 +11,6 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR -cuopt_coredump_smoke_fail_job_if_enabled # sets up a constraints file for 'pip' and puts its location in an exported variable PIP_EXPORT, # so those constraints will affect all future 'pip install' calls diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh index 62f4a5109c..19c4bf7c1a 100755 --- a/ci/test_wheel_cuopt_server.sh +++ b/ci/test_wheel_cuopt_server.sh @@ -11,7 +11,6 @@ source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" cuopt_enable_coredumps trap 'cuopt_collect_coredumps || true' EXIT unset _CUOPT_CI_DIR -cuopt_coredump_smoke_fail_job_if_enabled source rapids-init-pip diff --git a/cpp/tests/utilities/coredump_sanity_test.cpp b/cpp/tests/utilities/coredump_sanity_test.cpp deleted file mode 100644 index 31fafc61f7..0000000000 --- 
a/cpp/tests/utilities/coredump_sanity_test.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#include <gtest/gtest.h> - -#include <csignal> -#include <cstdlib> -#include <cstring> - -namespace { - -bool coredump_env_enabled() -{ - const char* v = std::getenv("CUOPT_TEST_COREDUMP"); - if (v == nullptr || v[0] == '\0') { return false; } - return std::strcmp(v, "0") != 0; -} - -} // namespace - -// Opt-in crash to validate CI core dump collection (see ci/cuopt_coredumps.sh). -// Normal CI: skipped. To reproduce locally or in a one-off job: -// CUOPT_TEST_COREDUMP=1 ulimit -c unlimited ./COREDUMP_SANITY_TEST -TEST(CoredumpSanity, IntentionalSegfaultWhenEnvSet) -{ - if (!coredump_env_enabled()) { - GTEST_SKIP() << "Set CUOPT_TEST_COREDUMP=1 to intentionally SIGSEGV for core dump checks."; - } - std::raise(SIGSEGV); -} From cd5cf1f0a8844ffc071b1fc11fca77809d1f79c9 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 16:05:05 -0500 Subject: [PATCH 11/18] revert ctesting addons --- ci/run_ctests.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh index 01508ffc01..bdd5857c06 100755 --- a/ci/run_ctests.sh +++ b/ci/run_ctests.sh @@ -25,11 +25,7 @@ shopt -s nullglob for gt in "${GTEST_DIR}"/*_TEST; do test_name=$(basename "${gt}") echo "Running gtest ${test_name}" - if ! "${gt}" "$@"; then - _g_rc=$?
- echo "ERROR: gtest ${test_name} failed (exit ${_g_rc}); stopping run_ctests.sh" >&2 - exit "${_g_rc}" - fi + "${gt}" "$@" done shopt -u nullglob From f33d6b43c9f7403400dd15d28f1b18c96fbbaf40 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 16:07:47 -0500 Subject: [PATCH 12/18] revert changes --- ci/test_cpp.sh | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 1caaed4c54..d9d0e36e7c 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -51,19 +51,14 @@ pushd "${RAPIDS_DATASET_ROOT_DIR}" ./get_test_data.sh popd +EXITCODE=0 +trap "EXITCODE=1" ERR set +e - # Run gtests from libcuopt-tests package export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ rapids-logger "Run gtests" timeout 40m ./ci/run_ctests.sh -EXITCODE=$? -set -e - -if [[ "${EXITCODE}" -ne 0 ]]; then - rapids-logger "run_ctests.sh failed (exit ${EXITCODE}); skipping remaining steps" -fi -rapids-logger "Test script exiting with value: ${EXITCODE}" -exit "${EXITCODE}" +rapids-logger "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} From f707b553f90cf847b1b992d8599cbecb71958e95 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 16:38:27 -0500 Subject: [PATCH 13/18] refactor --- ci/test_cpp.sh | 19 +++++++++++++------ ci/test_cpp_memcheck.sh | 9 +++------ ci/test_notebooks.sh | 9 +++------ ci/test_python.sh | 9 +++------ ci/test_self_hosted_service.sh | 9 +++------ ci/test_skills_assets.sh | 9 +++------ ci/test_wheel_cuopt.sh | 9 +++------ ci/test_wheel_cuopt_server.sh | 9 +++------ ci/{ => utils}/cuopt_coredumps.sh | 10 ++++++++-- 9 files changed, 42 insertions(+), 50 deletions(-) rename ci/{ => utils}/cuopt_coredumps.sh (95%) diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index d9d0e36e7c..ec386f9e6f 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -5,12 +5,9 @@ set -euo pipefail -_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=ci/cuopt_coredumps.sh -source 
"${_CUOPT_CI_DIR}/cuopt_coredumps.sh" -cuopt_enable_coredumps -trap 'cuopt_collect_coredumps || true' EXIT -unset _CUOPT_CI_DIR +# shellcheck source=ci/utils/cuopt_coredumps.sh +source "$(dirname "${BASH_SOURCE[0]}")/utils/cuopt_coredumps.sh" +cuopt_coredumps_ci_setup . /opt/conda/etc/profile.d/conda.sh @@ -60,5 +57,15 @@ export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ rapids-logger "Run gtests" timeout 40m ./ci/run_ctests.sh +# Optional core-dump path check: no compiled binary — child bash sends itself SIGSEGV. +# Child exits 139; || true keeps this script running so the EXIT trap can collect cores. +# For normal CI, leave unset and set CUOPT_CI_COREDUMP_PROBE=1 only when probing artifacts. +CUOPT_CI_COREDUMP_PROBE=1 +if [[ "${CUOPT_CI_COREDUMP_PROBE:-}" == 1 ]]; then + rapids-logger "CUOPT_CI_COREDUMP_PROBE: child bash SIGSEGV (core dump artifact check)" + bash -c 'kill -SEGV $$' || true + sleep 0.2 +fi + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh index d9ddbc7edd..8b61198240 100755 --- a/ci/test_cpp_memcheck.sh +++ b/ci/test_cpp_memcheck.sh @@ -10,12 +10,9 @@ fi set -euo pipefail -_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=ci/cuopt_coredumps.sh -source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" -cuopt_enable_coredumps -trap 'cuopt_collect_coredumps || true' EXIT -unset _CUOPT_CI_DIR +# shellcheck source=ci/utils/cuopt_coredumps.sh +source "$(dirname "${BASH_SOURCE[0]}")/utils/cuopt_coredumps.sh" +cuopt_coredumps_ci_setup . 
/opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index b7897e4032..54bb4d2967 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -5,12 +5,9 @@ set -euo pipefail -_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=ci/cuopt_coredumps.sh -source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" -cuopt_enable_coredumps -trap 'cuopt_collect_coredumps || true' EXIT -unset _CUOPT_CI_DIR +# shellcheck source=ci/utils/cuopt_coredumps.sh +source "$(dirname "${BASH_SOURCE[0]}")/utils/cuopt_coredumps.sh" +cuopt_coredumps_ci_setup . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_python.sh b/ci/test_python.sh index aec654ad17..831d4dc174 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -5,12 +5,9 @@ set -euo pipefail -_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=ci/cuopt_coredumps.sh -source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" -cuopt_enable_coredumps -trap 'cuopt_collect_coredumps || true' EXIT -unset _CUOPT_CI_DIR +# shellcheck source=ci/utils/cuopt_coredumps.sh +source "$(dirname "${BASH_SOURCE[0]}")/utils/cuopt_coredumps.sh" +cuopt_coredumps_ci_setup . 
/opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_self_hosted_service.sh b/ci/test_self_hosted_service.sh index c9b7d509b6..ef0ab14b35 100755 --- a/ci/test_self_hosted_service.sh +++ b/ci/test_self_hosted_service.sh @@ -5,12 +5,9 @@ set -euo pipefail -_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=ci/cuopt_coredumps.sh -source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" -cuopt_enable_coredumps -trap 'cuopt_collect_coredumps || true' EXIT -unset _CUOPT_CI_DIR +# shellcheck source=ci/utils/cuopt_coredumps.sh +source "$(dirname "${BASH_SOURCE[0]}")/utils/cuopt_coredumps.sh" +cuopt_coredumps_ci_setup source rapids-init-pip diff --git a/ci/test_skills_assets.sh b/ci/test_skills_assets.sh index ccb5161a11..583044bea0 100755 --- a/ci/test_skills_assets.sh +++ b/ci/test_skills_assets.sh @@ -10,12 +10,9 @@ set -euo pipefail -_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=ci/cuopt_coredumps.sh -source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" -cuopt_enable_coredumps -trap 'cuopt_collect_coredumps || true' EXIT -unset _CUOPT_CI_DIR +# shellcheck source=ci/utils/cuopt_coredumps.sh +source "$(dirname "${BASH_SOURCE[0]}")/utils/cuopt_coredumps.sh" +cuopt_coredumps_ci_setup # Use rapids-logger in CI; fall back to echo for local testing if command -v rapids-logger &>/dev/null; then diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh index 33a5300581..2859b96fe3 100755 --- a/ci/test_wheel_cuopt.sh +++ b/ci/test_wheel_cuopt.sh @@ -5,12 +5,9 @@ set -euo pipefail -_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=ci/cuopt_coredumps.sh -source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" -cuopt_enable_coredumps -trap 'cuopt_collect_coredumps || true' EXIT -unset _CUOPT_CI_DIR +# shellcheck source=ci/utils/cuopt_coredumps.sh +source "$(dirname "${BASH_SOURCE[0]}")/utils/cuopt_coredumps.sh" +cuopt_coredumps_ci_setup # sets up a constraints file for 'pip' and puts its location in an 
exported variable PIP_EXPORT, # so those constraints will affect all future 'pip install' calls diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh index 19c4bf7c1a..cda880765c 100755 --- a/ci/test_wheel_cuopt_server.sh +++ b/ci/test_wheel_cuopt_server.sh @@ -5,12 +5,9 @@ set -eou pipefail -_CUOPT_CI_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=ci/cuopt_coredumps.sh -source "${_CUOPT_CI_DIR}/cuopt_coredumps.sh" -cuopt_enable_coredumps -trap 'cuopt_collect_coredumps || true' EXIT -unset _CUOPT_CI_DIR +# shellcheck source=ci/utils/cuopt_coredumps.sh +source "$(dirname "${BASH_SOURCE[0]}")/utils/cuopt_coredumps.sh" +cuopt_coredumps_ci_setup source rapids-init-pip diff --git a/ci/cuopt_coredumps.sh b/ci/utils/cuopt_coredumps.sh similarity index 95% rename from ci/cuopt_coredumps.sh rename to ci/utils/cuopt_coredumps.sh index 44b2e7030a..c8ec2ef84f 100644 --- a/ci/cuopt_coredumps.sh +++ b/ci/utils/cuopt_coredumps.sh @@ -17,8 +17,8 @@ # no edits here — e.g. test_foo.sh → conda-foo-tests, test_wheel_bar.sh → wheel-bar-tests). # Then CUDA / Python / arch / build_type from RAPIDS CI env. # -# Shells: source this file from repo ci/ scripts, then call cuopt_enable_coredumps and trap -# cuopt_collect_coredumps on EXIT. +# Test drivers: source ci/utils/cuopt_coredumps.sh from a sibling ci/test_*.sh, then call +# cuopt_coredumps_ci_setup (enable + EXIT trap). # Set in cuopt_enable_coredumps; collect reuses when non-empty. CUOPT_GDB_CORE_ARTIFACT_DIR= @@ -201,3 +201,9 @@ cuopt_collect_coredumps() { fi fi } + +# Standard CI wiring for ci/test_*.sh: call once after sourcing this file. 
+cuopt_coredumps_ci_setup() { + cuopt_enable_coredumps + trap 'cuopt_collect_coredumps || true' EXIT +} From f97a868ea7876225025025715289fbf28e9dddf8 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 6 Apr 2026 16:41:43 -0500 Subject: [PATCH 14/18] update --- ci/run_ctests.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh index bdd5857c06..fc1de8e1b4 100755 --- a/ci/run_ctests.sh +++ b/ci/run_ctests.sh @@ -21,13 +21,11 @@ else exit 1 fi -shopt -s nullglob for gt in "${GTEST_DIR}"/*_TEST; do test_name=$(basename "${gt}") echo "Running gtest ${test_name}" "${gt}" "$@" done -shopt -u nullglob # Run C_API_TEST with CPU memory for local solves (excluding time limit tests) if [ -x "${GTEST_DIR}/C_API_TEST" ]; then From d44697472037a3fe5ee364ee8cd9eede4061dec6 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 9 Apr 2026 10:29:49 -0500 Subject: [PATCH 15/18] update --- ci/test_cpp.sh | 17 +++- ci/utils/cuopt_coredumps.sh | 191 +++++++++++++++++++++++++++++------- 2 files changed, 172 insertions(+), 36 deletions(-) diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index ec386f9e6f..719e26bcb9 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -63,8 +63,23 @@ timeout 40m ./ci/run_ctests.sh CUOPT_CI_COREDUMP_PROBE=1 if [[ "${CUOPT_CI_COREDUMP_PROBE:-}" == 1 ]]; then rapids-logger "CUOPT_CI_COREDUMP_PROBE: child bash SIGSEGV (core dump artifact check)" + # Count core files before the probe. + _probe_n_before="$(find "${CUOPT_COREDUMP_DIR:-/dev/null}" -type f 2>/dev/null | wc -l | tr -d '[:space:]')" bash -c 'kill -SEGV $$' || true - sleep 0.2 + # Brief pause so the kernel can finish writing the core. + sleep 1 + # Eagerly collect now so we can verify the probe worked. 
+ cuopt_collect_coredumps || true + _probe_n_after="$(find "${CUOPT_COREDUMP_DIR:-/dev/null}" -type f 2>/dev/null | wc -l | tr -d '[:space:]')" + if [[ "${_probe_n_after}" -gt "${_probe_n_before}" ]]; then + rapids-logger "COREDUMP_PROBE: SUCCESS — $((_probe_n_after - _probe_n_before)) core file(s) collected" + else + rapids-logger "COREDUMP_PROBE: FAILED — no core file collected for SIGSEGV probe" + rapids-logger " core_pattern=$(cat /proc/sys/kernel/core_pattern 2>/dev/null || echo n/a)" + rapids-logger " ulimit -c=$(ulimit -c)" + rapids-logger " CUOPT_COREDUMP_DIR=${CUOPT_COREDUMP_DIR:-unset}" + rapids-logger " Hint: core_pattern may require a privileged container or --cap-add=SYS_PTRACE" + fi fi rapids-logger "Test script exiting with value: $EXITCODE" diff --git a/ci/utils/cuopt_coredumps.sh b/ci/utils/cuopt_coredumps.sh index c8ec2ef84f..61de4fc2d9 100644 --- a/ci/utils/cuopt_coredumps.sh +++ b/ci/utils/cuopt_coredumps.sh @@ -124,6 +124,9 @@ cuopt_enable_coredumps() { CUOPT_GDB_CORE_ARTIFACT_DIR="$(cuopt__gdb_core_artifact_basename)" export CUOPT_GDB_CORE_ARTIFACT_DIR export CUOPT_COREDUMP_DIR="${base}/${CUOPT_GDB_CORE_ARTIFACT_DIR}" + # Record startup time so coredumpctl collection can filter to this session only. + export CUOPT_COREDUMP_SINCE + CUOPT_COREDUMP_SINCE="$(date '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo '')" mkdir -p "${CUOPT_COREDUMP_DIR}" local pattern_target="${CUOPT_COREDUMP_DIR}/core.%e.%p.%t" @@ -132,9 +135,11 @@ cuopt_enable_coredumps() { ulimit -c unlimited 2>/dev/null || true ulimit -H -c unlimited 2>/dev/null || true - # When unset, ask the kernel for broad core dump contents (Linux 4.6+; ignored elsewhere). - if [[ -z "${COREDUMP_FILTER:-}" ]]; then - export COREDUMP_FILTER=0xff + # Write the coredump filter to the kernel's per-process file (env var alone has no effect). + # 0xff = dump all memory segments (shared, private, huge, DAX — Linux 4.6+). 
+ local filter="${COREDUMP_FILTER:-0xff}" + if [[ -w /proc/self/coredump_filter ]]; then + echo "${filter}" >/proc/self/coredump_filter 2>/dev/null || true fi # Prefer writing cores as files under CUOPT_COREDUMP_DIR (often fails in unprivileged Docker). @@ -146,21 +151,139 @@ cuopt_enable_coredumps() { fi pattern="$(cat /proc/sys/kernel/core_pattern 2>/dev/null || echo n/a)" + + # Track whether core_pattern points to our directory (file-based) or a pipe/collector. + export CUOPT_COREDUMP_PATTERN_IS_PIPE=0 + if [[ "${pattern}" == \|* ]]; then + CUOPT_COREDUMP_PATTERN_IS_PIPE=1 + fi + + local coredump_filter_val="n/a" + if [[ -r /proc/self/coredump_filter ]]; then + coredump_filter_val="$(cat /proc/self/coredump_filter 2>/dev/null || echo n/a)" + fi + + local _log_msg="Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit=$(ulimit -c) core_pattern=${pattern} coredump_filter=${coredump_filter_val}" if declare -F rapids-logger &>/dev/null; then - rapids-logger "Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit -c=$(ulimit -c) core_pattern=${pattern}" - if [[ "${pattern}" == \|* ]]; then - rapids-logger "WARNING: core_pattern pipes to a collector (e.g. apport); cores may not appear as files under ${CUOPT_COREDUMP_DIR}. Use a writable core_pattern or a privileged runner if needed." - fi + rapids-logger "${_log_msg}" else - echo "Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit -c=$(ulimit -c) core_pattern=${pattern}" - if [[ "${pattern}" == \|* ]]; then - echo "WARNING: core_pattern pipes to a collector; files may not land in ${CUOPT_COREDUMP_DIR}" >&2 + echo "${_log_msg}" + fi + + if [[ "${CUOPT_COREDUMP_PATTERN_IS_PIPE}" == 1 ]]; then + local _pipe_msg="WARNING: core_pattern pipes to a collector — cores will NOT appear as files. Fallback: coredumpctl (systemd-coredump) or /var/crash (apport) will be checked at collection time." + if command -v coredumpctl &>/dev/null; then + _pipe_msg+=" coredumpctl is available." 
+ else + _pipe_msg+=" coredumpctl NOT found; if systemd-coredump is the handler, cores may be lost." + fi + if declare -F rapids-logger &>/dev/null; then + rapids-logger "${_pipe_msg}" + else + echo "WARNING: ${_pipe_msg}" >&2 fi fi } +cuopt__log() { + if declare -F rapids-logger &>/dev/null; then + rapids-logger "$1" + else + echo "$1" + fi +} + +# Copy a single core file into the artifact directory with a sanitized name. +cuopt__copy_core_to_dest() { + local f="$1" dest="$2" label="${3:-}" + [[ -f "${f}" && -s "${f}" ]] || return 0 + local base_name + base_name="$(basename "${f}")" + if [[ -n "${label}" ]]; then + base_name="${label}_${base_name}" + fi + base_name="${base_name//\//_}" + local dest_path="${dest}/${base_name}" + if [[ -e "${dest_path}" ]]; then + dest_path="${dest}/${base_name}.${RANDOM}" + fi + cp -a "${f}" "${dest_path}" 2>/dev/null || true +} + +# Collect cores written as files (core_pattern was file-based or we got lucky). +cuopt__collect_core_files() { + local dest="$1" + shift + local search_dirs=("$@") + local f + for dir in "${search_dirs[@]}"; do + [[ -d "${dir}" ]] || continue + while IFS= read -r -d '' f; do + [[ -f "${f}" ]] || continue + # Skip files already in dest. + case "${f}" in + "${dest}/"*) continue ;; + esac + cuopt__copy_core_to_dest "${f}" "${dest}" "" + done < <( + find "${dir}" \ + \( -path '*/.git/*' -o -path '*/opt/conda/*' -o -path '*/conda_pkgs/*' -o -path "${dest}/*" \) -prune -o \ + \( -name 'core' -o -name 'core.*' \) -type f -print0 2>/dev/null + ) + done +} + +# Fallback: extract cores via coredumpctl (systemd-coredump handler). +cuopt__collect_via_coredumpctl() { + local dest="$1" + command -v coredumpctl &>/dev/null || return 0 + + cuopt__log "Attempting coredumpctl extraction (core_pattern is piped to systemd-coredump)" + + # Build the coredumpctl list command — scope to this session if we have a start time. 
+ local -a list_cmd=(coredumpctl list --no-pager --no-legend) + if [[ -n "${CUOPT_COREDUMP_SINCE:-}" ]]; then + list_cmd+=(--since "${CUOPT_COREDUMP_SINCE}") + cuopt__log " Filtering coredumpctl to cores since ${CUOPT_COREDUMP_SINCE}" + fi + + local line pid exe core_path + # --no-legend output format: DAY DATE TIME TZ PID UID GID SIG COREFILE EXE... + while IFS= read -r line; do + # Skip header / empty lines. + [[ "${line}" =~ ^[[:space:]]*[A-Z] ]] && continue + [[ -z "${line}" ]] && continue + # Parse PID (5th field) and EXE (last field). + pid="$(echo "${line}" | awk '{print $5}')" + exe="$(echo "${line}" | awk '{print $NF}')" + [[ -n "${pid}" ]] || continue + core_path="${dest}/coredumpctl_${pid}_$(basename "${exe:-unknown}").core" + if [[ -e "${core_path}" ]]; then + core_path="${core_path}.${RANDOM}" + fi + coredumpctl dump "${pid}" -o "${core_path}" 2>/dev/null || true + if [[ -s "${core_path}" ]]; then + cuopt__log "Extracted core for PID ${pid} (${exe}) → ${core_path} ($(du -h "${core_path}" | cut -f1))" + else + rm -f "${core_path}" 2>/dev/null || true + fi + done < <("${list_cmd[@]}" 2>/dev/null || true) +} + +# Fallback: collect cores from apport crash reports (/var/crash). 
+cuopt__collect_from_apport() { + local dest="$1" + local crash_dir="/var/crash" + [[ -d "${crash_dir}" ]] || return 0 + local f + for f in "${crash_dir}"/*.crash "${crash_dir}"/core.* "${crash_dir}"/core; do + [[ -f "${f}" && -s "${f}" ]] || continue + cuopt__copy_core_to_dest "${f}" "${dest}" "apport" + done +} + cuopt_collect_coredumps() { - local ws base dest n_before n_after f rel dest_name dest_path + local ws base dest n_before n_after ws="${GITHUB_WORKSPACE:-${PWD}}" base="${RAPIDS_ARTIFACTS_DIR:-${ws}/artifacts}" if [[ -z "${CUOPT_GDB_CORE_ARTIFACT_DIR:-}" ]]; then @@ -171,33 +294,31 @@ cuopt_collect_coredumps() { n_before="$(find "${dest}" -type f 2>/dev/null | wc -l | tr -d '[:space:]')" - while IFS= read -r -d '' f; do - [[ -f "${f}" ]] || continue - case "${f}" in - "${dest}/"*) continue ;; - esac - rel="${f#"${ws}"/}" - if [[ "${rel}" == "${f}" ]]; then - rel="$(basename "${f}")" - fi - dest_name="${rel//\//_}" - dest_path="${dest}/${dest_name}" - if [[ -e "${dest_path}" ]]; then - dest_path="${dest}/${dest_name}.${RANDOM}" - fi - cp -a "${f}" "${dest_path}" 2>/dev/null || true - done < <( - find "${ws}" \ - \( -path '*/.git/*' -o -path '*/opt/conda/*' -o -path '*/conda_pkgs/*' -o -path "${dest}/*" \) -prune -o \ - \( -name 'core' -o -name 'core.*' \) -type f -print0 2>/dev/null - ) + # 1) Search for core files in workspace + common system locations. + cuopt__collect_core_files "${dest}" \ + "${ws}" "/tmp" "/var/lib/systemd/coredump" "/var/crash" + + # 2) If core_pattern pipes to a collector, try extracting via coredumpctl / apport. 
+ if [[ "${CUOPT_COREDUMP_PATTERN_IS_PIPE:-0}" == 1 ]]; then + cuopt__collect_via_coredumpctl "${dest}" + cuopt__collect_from_apport "${dest}" + fi n_after="$(find "${dest}" -type f 2>/dev/null | wc -l | tr -d '[:space:]')" if [[ "${n_after}" -gt "${n_before}" ]]; then - if declare -F rapids-logger &>/dev/null; then - rapids-logger "Wrote $((n_after - n_before)) core file(s) into ${dest} (${n_after} total)" - else - echo "cuOpt coredumps: ${n_after} file(s) in ${dest}" + cuopt__log "Collected $((n_after - n_before)) core file(s) into ${dest} (${n_after} total)" + ls -lh "${dest}"/ 2>/dev/null || true + else + cuopt__log "WARNING: No core files found. Cores may have been discarded by the system collector." + cuopt__log " core_pattern=$(cat /proc/sys/kernel/core_pattern 2>/dev/null || echo n/a)" + cuopt__log " Searched: ${ws} /tmp /var/lib/systemd/coredump /var/crash" + if [[ "${CUOPT_COREDUMP_PATTERN_IS_PIPE:-0}" == 1 ]]; then + if command -v coredumpctl &>/dev/null; then + cuopt__log " coredumpctl list output:" + coredumpctl list --no-pager 2>/dev/null || true + else + cuopt__log " coredumpctl not available; cannot extract from systemd-coredump" + fi fi fi } From 5c34d6b04a65296c305bab2d2ad57cdd8a179378 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 9 Apr 2026 10:31:07 -0500 Subject: [PATCH 16/18] fix --- ci/utils/cuopt_coredumps.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/utils/cuopt_coredumps.sh b/ci/utils/cuopt_coredumps.sh index 61de4fc2d9..b76b06f879 100644 --- a/ci/utils/cuopt_coredumps.sh +++ b/ci/utils/cuopt_coredumps.sh @@ -163,7 +163,8 @@ cuopt_enable_coredumps() { coredump_filter_val="$(cat /proc/self/coredump_filter 2>/dev/null || echo n/a)" fi - local _log_msg="Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit=$(ulimit -c) core_pattern=${pattern} coredump_filter=${coredump_filter_val}" + local _log_msg + _log_msg="Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit=$(ulimit -c) core_pattern=${pattern} 
coredump_filter=${coredump_filter_val}" if declare -F rapids-logger &>/dev/null; then rapids-logger "${_log_msg}" else From 3eb48b9319f9d3e7af3f9aa6ad97a635148c0764 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 9 Apr 2026 10:40:24 -0500 Subject: [PATCH 17/18] remove redundant items --- ci/utils/cuopt_coredumps.sh | 42 ++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/ci/utils/cuopt_coredumps.sh b/ci/utils/cuopt_coredumps.sh index b76b06f879..5e69b0781c 100644 --- a/ci/utils/cuopt_coredumps.sh +++ b/ci/utils/cuopt_coredumps.sh @@ -115,6 +115,14 @@ cuopt__gdb_core_artifact_basename() { echo "cuopt-gdb-cores_${job}_cuda${cuda_ver}_py${py_ver}_${arch_}_${bt}" } +cuopt__log() { + if declare -F rapids-logger &>/dev/null; then + rapids-logger "$1" + else + echo "$1" + fi +} + cuopt_enable_coredumps() { local ws base pattern ws="${GITHUB_WORKSPACE:-${PWD}}" @@ -163,13 +171,7 @@ cuopt_enable_coredumps() { coredump_filter_val="$(cat /proc/self/coredump_filter 2>/dev/null || echo n/a)" fi - local _log_msg - _log_msg="Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit=$(ulimit -c) core_pattern=${pattern} coredump_filter=${coredump_filter_val}" - if declare -F rapids-logger &>/dev/null; then - rapids-logger "${_log_msg}" - else - echo "${_log_msg}" - fi + cuopt__log "Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit=$(ulimit -c) core_pattern=${pattern} coredump_filter=${coredump_filter_val}" if [[ "${CUOPT_COREDUMP_PATTERN_IS_PIPE}" == 1 ]]; then local _pipe_msg="WARNING: core_pattern pipes to a collector — cores will NOT appear as files. Fallback: coredumpctl (systemd-coredump) or /var/crash (apport) will be checked at collection time." @@ -178,19 +180,7 @@ cuopt_enable_coredumps() { else _pipe_msg+=" coredumpctl NOT found; if systemd-coredump is the handler, cores may be lost." 
fi - if declare -F rapids-logger &>/dev/null; then - rapids-logger "${_pipe_msg}" - else - echo "WARNING: ${_pipe_msg}" >&2 - fi - fi -} - -cuopt__log() { - if declare -F rapids-logger &>/dev/null; then - rapids-logger "$1" - else - echo "$1" + cuopt__log "${_pipe_msg}" fi } @@ -258,9 +248,13 @@ cuopt__collect_via_coredumpctl() { pid="$(echo "${line}" | awk '{print $5}')" exe="$(echo "${line}" | awk '{print $NF}')" [[ -n "${pid}" ]] || continue - core_path="${dest}/coredumpctl_${pid}_$(basename "${exe:-unknown}").core" + local exe_base + exe_base="$(basename "${exe:-unknown}")" + core_path="${dest}/coredumpctl_${pid}_${exe_base}.core" + # Skip if this PID was already extracted (e.g. by a prior probe collection). if [[ -e "${core_path}" ]]; then - core_path="${core_path}.${RANDOM}" + cuopt__log " Skipping PID ${pid} (${exe_base}) — already extracted" + continue fi coredumpctl dump "${pid}" -o "${core_path}" 2>/dev/null || true if [[ -s "${core_path}" ]]; then @@ -297,7 +291,7 @@ cuopt_collect_coredumps() { # 1) Search for core files in workspace + common system locations. cuopt__collect_core_files "${dest}" \ - "${ws}" "/tmp" "/var/lib/systemd/coredump" "/var/crash" + "${ws}" "/tmp" "/var/lib/systemd/coredump" # 2) If core_pattern pipes to a collector, try extracting via coredumpctl / apport. if [[ "${CUOPT_COREDUMP_PATTERN_IS_PIPE:-0}" == 1 ]]; then @@ -312,7 +306,7 @@ cuopt_collect_coredumps() { else cuopt__log "WARNING: No core files found. Cores may have been discarded by the system collector." 
cuopt__log " core_pattern=$(cat /proc/sys/kernel/core_pattern 2>/dev/null || echo n/a)" - cuopt__log " Searched: ${ws} /tmp /var/lib/systemd/coredump /var/crash" + cuopt__log " Searched: ${ws} /tmp /var/lib/systemd/coredump (+ /var/crash if piped)" if [[ "${CUOPT_COREDUMP_PATTERN_IS_PIPE:-0}" == 1 ]]; then if command -v coredumpctl &>/dev/null; then cuopt__log " coredumpctl list output:" From 367ec36b273f293511259f40750efb5d182f24f6 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 9 Apr 2026 12:05:28 -0500 Subject: [PATCH 18/18] hijack apport pipe handler to capture core dumps in unprivileged containers When core_pattern pipes to apport (common in CI), apport silently discards cores from non-packaged binaries. Replace the handler with a forwarder script that reads the core from stdin and writes it to the artifact directory. Falls back gracefully if the handler is read-only. --- ci/utils/cuopt_coredumps.sh | 72 ++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 4 deletions(-) diff --git a/ci/utils/cuopt_coredumps.sh b/ci/utils/cuopt_coredumps.sh index 5e69b0781c..b0a85d5918 100644 --- a/ci/utils/cuopt_coredumps.sh +++ b/ci/utils/cuopt_coredumps.sh @@ -123,6 +123,61 @@ cuopt__log() { fi } +# When core_pattern pipes to a handler (apport/systemd-coredump), replace the handler +# binary with a forwarder script that reads the core from stdin and writes it to disk. +# The kernel invokes: |/path/to/handler -p%p -s%s ... -- %E +# Our replacement reads stdin (the raw core) and writes it using the -p (PID) argument. +cuopt__hijack_pipe_handler() { + local pattern="$1" dest="$2" + # Extract the handler binary path (first token after the leading '|'). + local handler + handler="$(echo "${pattern}" | sed 's/^|//; s/ .*//')" + [[ -n "${handler}" && -f "${handler}" ]] || return 0 + + # Back up the original handler (only once). + local backup="${handler}.cuopt_orig" + if [[ ! 
-f "${backup}" ]]; then + cp -a "${handler}" "${backup}" 2>/dev/null || { + cuopt__log "WARNING: cannot back up ${handler} — hijack skipped (read-only?)" + return 0 + } + fi + + # Write a forwarder script that saves stdin (core dump) to the artifact dir. + # The kernel passes flags like -p PID; we parse -p to name the file. + cat > "${handler}" <<'FORWARDER_EOF' +#!/bin/sh +# cuOpt core-dump forwarder — replaces apport/systemd-coredump handler. +# Reads raw core dump from stdin, writes to CUOPT_COREDUMP_DIR. +PID="unknown" +EXE="unknown" +while [ $# -gt 0 ]; do + case "$1" in + -p) shift; PID="$1" ;; + -p*) PID="${1#-p}" ;; + --) shift; EXE="$(echo "$*" | tr '/' '_')" ; break ;; + esac + shift +done +FORWARDER_EOF + # Append the dest directory (expanded now, not at runtime). + cat >> "${handler}" <> "${handler}" <<'FORWARDER_BODY' +mkdir -p "${DEST}" 2>/dev/null +CORE_FILE="${DEST}/core.${EXE}.${PID}" +cat > "${CORE_FILE}" +if [ -s "${CORE_FILE}" ]; then + echo "cuopt-forwarder: saved core for PID ${PID} (${EXE}) → ${CORE_FILE}" >&2 +else + rm -f "${CORE_FILE}" 2>/dev/null +fi +FORWARDER_BODY + chmod +x "${handler}" 2>/dev/null || true + cuopt__log "Hijacked pipe handler ${handler} → core forwarder (dest=${dest})" +} + cuopt_enable_coredumps() { local ws base pattern ws="${GITHUB_WORKSPACE:-${PWD}}" @@ -164,6 +219,10 @@ cuopt_enable_coredumps() { export CUOPT_COREDUMP_PATTERN_IS_PIPE=0 if [[ "${pattern}" == \|* ]]; then CUOPT_COREDUMP_PATTERN_IS_PIPE=1 + # Attempt to hijack the pipe handler (e.g. apport) with a forwarder that saves cores + # as files. The kernel pipes the core dump on stdin to the handler binary; if we replace + # it with our own script, the core lands in CUOPT_COREDUMP_DIR. 
+ cuopt__hijack_pipe_handler "${pattern}" "${CUOPT_COREDUMP_DIR}" fi local coredump_filter_val="n/a" @@ -174,11 +233,16 @@ cuopt_enable_coredumps() { cuopt__log "Core dumps: dir=${CUOPT_COREDUMP_DIR} ulimit=$(ulimit -c) core_pattern=${pattern} coredump_filter=${coredump_filter_val}" if [[ "${CUOPT_COREDUMP_PATTERN_IS_PIPE}" == 1 ]]; then - local _pipe_msg="WARNING: core_pattern pipes to a collector — cores will NOT appear as files. Fallback: coredumpctl (systemd-coredump) or /var/crash (apport) will be checked at collection time." - if command -v coredumpctl &>/dev/null; then - _pipe_msg+=" coredumpctl is available." + local _pipe_msg="core_pattern pipes to a collector." + if [[ -f "$(echo "${pattern}" | sed 's/^|//; s/ .*//' ).cuopt_orig" ]]; then + _pipe_msg+=" Handler hijacked with core forwarder — cores should land in ${CUOPT_COREDUMP_DIR}." else - _pipe_msg+=" coredumpctl NOT found; if systemd-coredump is the handler, cores may be lost." + _pipe_msg+=" Handler hijack failed. Fallback: coredumpctl / /var/crash at collection time." + if command -v coredumpctl &>/dev/null; then + _pipe_msg+=" coredumpctl is available." + else + _pipe_msg+=" coredumpctl NOT found; cores may be lost." + fi fi cuopt__log "${_pipe_msg}" fi