diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index a945cde8ec..3ba0edd8c1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -37,6 +37,10 @@ on: If 'true', trigger the test workflow after all builds complete. type: boolean default: false + summary-only: + description: "If true, skip all build jobs and run only build-summary" + type: boolean + default: false concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -44,6 +48,7 @@ concurrency: jobs: cpp-build: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: @@ -53,6 +58,7 @@ jobs: sha: ${{ inputs.sha }} script: ci/build_cpp.sh python-build: + if: ${{ !inputs.summary-only }} needs: [cpp-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main @@ -63,6 +69,7 @@ jobs: sha: ${{ inputs.sha }} script: ci/build_python.sh upload-conda: + if: ${{ !inputs.summary-only }} needs: [cpp-build, python-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main @@ -72,6 +79,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} wheel-build-cuopt-mps-parser: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: @@ -86,6 +94,7 @@ jobs: # need 1 build per Python version and arch (but CUDA version doesn't matter so choose the latest) matrix_filter: 'group_by([.ARCH, (.PY_VER |split(".") | map(tonumber))])|map(max_by([(.CUDA_VER|split(".")|map(tonumber))]))' wheel-publish-cuopt-mps-parser: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-mps-parser secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -97,6 +106,7 @@ jobs: package-name: cuopt_mps_parser package-type: python wheel-build-libcuopt: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-mps-parser secrets: 
inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main @@ -110,6 +120,7 @@ jobs: package-type: cpp matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) wheel-publish-libcuopt: + if: ${{ !inputs.summary-only }} needs: wheel-build-libcuopt secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -121,6 +132,7 @@ jobs: package-name: libcuopt package-type: cpp wheel-build-cuopt: + if: ${{ !inputs.summary-only }} needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main @@ -133,6 +145,7 @@ jobs: package-name: cuopt package-type: python wheel-publish-cuopt: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -144,6 +157,7 @@ jobs: package-name: cuopt package-type: python wheel-build-cuopt-server: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: @@ -158,6 +172,7 @@ jobs: # Only need 1 package per CUDA major version. This selects "ARCH=amd64 + the latest supported Python, 1 job per major CUDA version". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) wheel-publish-cuopt-server: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-server secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -169,6 +184,7 @@ jobs: package-name: cuopt_server package-type: python docs-build: + if: ${{ !inputs.summary-only }} needs: [python-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main @@ -184,6 +200,7 @@ jobs: container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-sh-client: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: @@ -199,6 +216,7 @@ jobs: # only need 1 build (noarch package): this selects amd64, oldest-supported Python, latest-supported CUDA matrix_filter: '[map(select(.ARCH == "amd64")) | min_by((.PY_VER | split(".") | map(tonumber)), (.CUDA_VER | split(".") | map(-tonumber)))]' wheel-publish-cuopt-sh-client: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-sh-client secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -219,7 +237,7 @@ jobs: - wheel-publish-cuopt-server - wheel-publish-cuopt-sh-client - wheel-publish-libcuopt - if: inputs.trigger-tests + if: ${{ inputs.trigger-tests && !inputs.summary-only }} runs-on: ubuntu-latest # ref: https://docs.github.com/en/actions/reference/security/secure-use#use-an-intermediate-environment-variable env: @@ -242,7 +260,35 @@ jobs: -f date="${INPUT_DATE}" \ -f sha="${INPUT_SHA}" + build-summary: + if: ${{ always() && (inputs.build_type == 'nightly') }} + needs: + - tests + - build-images + - docs-build + runs-on: linux-amd64-cpu4 + container: + image: python:3.12-slim + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ 
inputs.sha }} + - name: Install dependencies + run: apt-get update && apt-get install -y --no-install-recommends curl + - name: Send build summary + env: + GITHUB_TOKEN: ${{ github.token }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_SERVER_URL: ${{ github.server_url }} + CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} + SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + RAPIDS_BRANCH: ${{ inputs.branch }} + run: bash ci/build_summary.sh + build-images: + if: ${{ !inputs.summary-only }} needs: - wheel-publish-cuopt - wheel-publish-cuopt-server diff --git a/.github/workflows/nightly-summary.yaml b/.github/workflows/nightly-summary.yaml new file mode 100644 index 0000000000..1bc3369c41 --- /dev/null +++ b/.github/workflows/nightly-summary.yaml @@ -0,0 +1,83 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: nightly-summary + +on: + workflow_dispatch: + inputs: + branch: + description: "Branch name the run targets" + required: true + type: string + default: main + sha: + description: "Full git commit SHA to check out" + required: true + type: string + build_type: + description: "Build type (nightly, pull-request, branch)" + required: true + type: string + default: nightly + date: + description: "Date (YYYY-MM-DD) for this run. Defaults to today." 
+ required: false + type: string + workflow_call: + inputs: + branch: + required: true + type: string + sha: + required: true + type: string + build_type: + required: true + type: string + date: + required: false + type: string + secrets: + CUOPT_S3_URI: + required: true + CUOPT_AWS_ACCESS_KEY_ID: + required: true + CUOPT_AWS_SECRET_ACCESS_KEY: + required: true + CUOPT_SLACK_WEBHOOK_URL: + required: false + CUOPT_SLACK_BOT_TOKEN: + required: false + CUOPT_SLACK_CHANNEL_ID: + required: false + +jobs: + nightly-summary: + runs-on: linux-amd64-cpu4 + container: + image: python:3.12-slim + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.sha }} + - name: Install dependencies + run: | + apt-get update && apt-get install -y --no-install-recommends curl + pip install awscli + - name: Run nightly summary + env: + CUOPT_S3_URI: ${{ secrets.CUOPT_S3_URI }} + CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} + CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} + CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + RAPIDS_BRANCH: ${{ inputs.branch }} + RUN_DATE: ${{ inputs.date }} + GITHUB_TOKEN: ${{ github.token }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_SERVER_URL: ${{ github.server_url }} + run: bash ci/nightly_summary.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a652c23b9a..be67501892 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -299,8 +299,8 @@ jobs: script: ci/test_cpp.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + 
script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY @@ -323,8 +323,8 @@ jobs: script: ci/test_python.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY @@ -384,8 +384,8 @@ jobs: script: ci/test_wheel_cuopt.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY @@ -424,8 +424,8 @@ jobs: script: ci/test_wheel_cuopt_server.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a8cc5f2943..5246ed0124 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,9 +24,14 @@ on: description: 
"build_type: one of [branch, nightly, pull-request]" type: string default: nightly + summary-only: + description: "If true, skip all test jobs and run only nightly-summary" + type: boolean + default: false jobs: conda-cpp-tests: + if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} @@ -35,13 +40,15 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_cpp.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + conda-python-tests: + if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: run_codecov: false @@ -51,13 +58,15 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_python.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + wheel-tests-cuopt: + if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} @@ -66,13 +75,15 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_cuopt.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + 
script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + wheel-tests-cuopt-server: + if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} @@ -81,13 +92,15 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_cuopt_server.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + conda-notebook-tests: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: @@ -99,3 +112,24 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:26.06-latest" script: ci/test_notebooks.sh + nightly-summary: + if: ${{ always() && inputs.build_type == 'nightly' }} + needs: + - conda-cpp-tests + - conda-python-tests + - wheel-tests-cuopt + - wheel-tests-cuopt-server + - conda-notebook-tests + uses: ./.github/workflows/nightly-summary.yaml + with: + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + build_type: ${{ inputs.build_type }} + date: ${{ inputs.date }} + secrets: + CUOPT_S3_URI: ${{ secrets.CUOPT_S3_URI }} + CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} + CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} + CUOPT_SLACK_BOT_TOKEN: ${{ 
secrets.CUOPT_SLACK_BOT_TOKEN }} + CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} diff --git a/ci/build_summary.sh b/ci/build_summary.sh new file mode 100755 index 0000000000..e8fd81a436 --- /dev/null +++ b/ci/build_summary.sh @@ -0,0 +1,158 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Send a Slack notification summarizing the build workflow status. +# Queries the GitHub API for job statuses and posts a compact message. + +set -euo pipefail + +BRANCH="${RAPIDS_BRANCH:-main}" +RUN_DATE="$(date +%F)" +GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}" +SLACK_WEBHOOK_URL="${CUOPT_SLACK_WEBHOOK_URL:-}" +SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}" +SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}" + +if [ -z "${SLACK_WEBHOOK_URL}" ] && [ -z "${SLACK_BOT_TOKEN}" ]; then + echo "No Slack credentials set, skipping build summary." + exit 0 +fi + +# Fetch workflow job statuses +JOBS_FILE=$(mktemp) +if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then + echo "Fetching build job statuses from GitHub API..." 
+ curl -s -L \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \ + > "${JOBS_FILE}" +else + echo "{}" > "${JOBS_FILE}" +fi + +# Generate Slack payload +PAYLOAD=$(python3 -c " +import json, sys + +with open(sys.argv[1]) as f: + data = json.load(f) +branch = sys.argv[2] +date = sys.argv[3] +run_url = sys.argv[4] + +jobs = data.get('jobs', []) + +# Filter out build-summary itself and compute-matrix helpers +jobs = [j for j in jobs + if 'build-summary' not in j.get('name', '').lower() + and 'compute-matrix' not in j.get('name', '').lower()] + +# Group by workflow prefix +groups = {} +for j in jobs: + name = j.get('name', '') + prefix = name.split(' / ')[0] if ' / ' in name else name + groups.setdefault(prefix, []).append(j) + +total = len(jobs) +failed_count = sum(1 for j in jobs if j.get('conclusion') == 'failure') +passed_count = sum(1 for j in jobs if j.get('conclusion') == 'success') + +if failed_count > 0: + emoji = ':x:' + status = f'{failed_count} build job(s) failed' +else: + emoji = ':white_check_mark:' + status = f'All {passed_count} build jobs passed' + +blocks = [] +blocks.append({ + 'type': 'header', + 'text': {'type': 'plain_text', 'text': f'cuOpt Build \u2014 {branch} \u2014 {date}', 'emoji': True}, +}) +blocks.append({ + 'type': 'section', + 'text': {'type': 'mrkdwn', 'text': f'{emoji} *{status}*'}, +}) +blocks.append({'type': 'divider'}) + +# Build status per group +lines = [] +for group_name, group_jobs in sorted(groups.items()): + g_passed = sum(1 for j in group_jobs if j.get('conclusion') == 'success') + g_failed = sum(1 for j in group_jobs if j.get('conclusion') == 'failure') + g_total = len(group_jobs) + + if g_failed > 0: + icon = ':x:' + detail = f'{g_failed}/{g_total} failed' + elif g_passed == g_total: + icon = ':white_check_mark:' + detail = f'{g_total} passed' + else: + icon = ':grey_question:' + 
detail = f'{g_passed}/{g_total} passed' + lines.append(f'{icon} *{group_name}* \u2014 {detail}') + +text = '\n'.join(lines) +if len(text) > 2900: + text = text[:2900] + '\n_...truncated_' +blocks.append({ + 'type': 'section', + 'text': {'type': 'mrkdwn', 'text': text}, +}) + +# Link +if run_url: + blocks.append({'type': 'divider'}) + blocks.append({ + 'type': 'context', + 'elements': [{'type': 'mrkdwn', 'text': f'<{run_url}|:github: GitHub Actions>'}], + }) + +print(json.dumps({ + 'username': 'cuOpt Build Bot', + 'icon_emoji': ':package:', + 'blocks': blocks, +})) +" "${JOBS_FILE}" "${BRANCH}" "${RUN_DATE}" "${GITHUB_RUN_URL}") + +rm -f "${JOBS_FILE}" + +# Send via bot token (preferred) or webhook +echo "Sending build summary to Slack..." +if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then + BOT_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +print(json.dumps(p)) +" "${PAYLOAD}" "${SLACK_CHANNEL_ID}") + + RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${BOT_PAYLOAD}" \ + "https://slack.com/api/chat.postMessage") + + OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "") + if [ "${OK}" != "True" ]; then + echo "WARNING: chat.postMessage failed: ${RESPONSE}" >&2 + # Fall back to webhook + curl -s -X POST -H 'Content-type: application/json' --data "${PAYLOAD}" "${SLACK_WEBHOOK_URL}" || true + else + echo "Build summary posted to Slack." + fi +elif [ -n "${SLACK_WEBHOOK_URL}" ]; then + response=$(curl -s -X POST \ + -H 'Content-type: application/json' \ + --data "${PAYLOAD}" \ + "${SLACK_WEBHOOK_URL}") + if [ "${response}" != "ok" ]; then + echo "WARNING: Slack webhook returned: ${response}" >&2 + else + echo "Build summary posted to Slack." 
+ fi +fi diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html new file mode 100644 index 0000000000..73329dea0c --- /dev/null +++ b/ci/dashboard/index.html @@ -0,0 +1,689 @@ + + + + + +cuOpt Nightly Test Dashboard + + + + + + + + + + + + +
+
Loading dashboard data...
+
+ + + + + + + + + diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh new file mode 100755 index 0000000000..41790d8b44 --- /dev/null +++ b/ci/nightly_summary.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Aggregate all per-matrix nightly test summaries and send a single +# consolidated Slack notification. Runs as a post-test job after all +# matrix CI jobs finish. +# +# The script needs S3 access via CUOPT_S3_URI (bucket root) and CUOPT_AWS_* credentials. +# +# Optional: +# CUOPT_SLACK_WEBHOOK_URL - sends Slack if set +# RAPIDS_BRANCH - branch name (default: main) +# RAPIDS_BUILD_TYPE - build type (nightly, pull-request, etc.) +# GITHUB_TOKEN - for querying workflow job statuses +# GITHUB_RUN_ID - current workflow run ID + +set -euo pipefail + +SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" +OUTPUT_DIR="${PWD}/aggregate-output" +mkdir -p "${OUTPUT_DIR}" + +RUN_DATE="${RUN_DATE:-$(date +%F)}" +BRANCH="${RAPIDS_BRANCH:-main}" + +GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}" + +# Map CUOPT_AWS_* to standard AWS env vars for the aws CLI +export AWS_ACCESS_KEY_ID="${CUOPT_AWS_ACCESS_KEY_ID:-${AWS_ACCESS_KEY_ID:-}}" +export AWS_SECRET_ACCESS_KEY="${CUOPT_AWS_SECRET_ACCESS_KEY:-${AWS_SECRET_ACCESS_KEY:-}}" +unset AWS_SESSION_TOKEN + +if [ -z "${CUOPT_S3_URI:-}" ]; then + echo "WARNING: CUOPT_S3_URI is not set. Skipping nightly aggregation." >&2 + exit 0 +fi + +S3_BASE="${CUOPT_S3_URI}ci_test_reports/nightly" +BRANCH_SLUG=$(echo "${BRANCH}" | tr '/' '-') +# Per-matrix summaries are uploaded by test jobs under summaries/{date}/{branch}/. +# For production nightlies (main, release/*), RAPIDS_BRANCH matches the branch input. 
+# For feature branch testing, RAPIDS_BRANCH may default to "main" in rapidsai containers, +# so we search the date prefix recursively (s3_list handles this). +S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/" +S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/" +S3_CONSOLIDATED_JSON="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/consolidated.json" +S3_CONSOLIDATED_HTML="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/consolidated.html" +S3_INDEX_URI="${S3_BASE}/index.json" +S3_DASHBOARD_URI="${S3_BASE}/dashboard/${BRANCH_SLUG}/index.html" +DASHBOARD_DIR="${SCRIPT_DIR}/dashboard" + +# --- Query GitHub API for workflow job statuses --- +WORKFLOW_JOBS_JSON="${OUTPUT_DIR}/workflow_jobs.json" +if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then + echo "Fetching workflow job statuses from GitHub API..." + curl -s -L \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \ + > "${WORKFLOW_JOBS_JSON}" || echo "{}" > "${WORKFLOW_JOBS_JSON}" +else + echo "WARNING: GITHUB_TOKEN or GITHUB_RUN_ID not set, skipping workflow job status." >&2 + echo "{}" > "${WORKFLOW_JOBS_JSON}" +fi + + +# Fallback: search the date-level prefix if branch-specific path is empty. +# This handles the case where RAPIDS_BRANCH in rapidsai containers differs +# from the branch input (e.g., feature branch testing where RAPIDS_BRANCH=main). 
+S3_SUMMARIES_FALLBACK="${S3_BASE}/summaries/${RUN_DATE}/" + +echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}" + +python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ + --s3-summaries-prefix "${S3_SUMMARIES_PREFIX}" \ + --s3-summaries-fallback "${S3_SUMMARIES_FALLBACK}" \ + --s3-reports-prefix "${S3_REPORTS_PREFIX}" \ + --s3-output-uri "${S3_CONSOLIDATED_JSON}" \ + --s3-html-output-uri "${S3_CONSOLIDATED_HTML}" \ + --s3-index-uri "${S3_INDEX_URI}" \ + --s3-dashboard-uri "${S3_DASHBOARD_URI}" \ + --dashboard-dir "${DASHBOARD_DIR}" \ + --output-dir "${OUTPUT_DIR}" \ + --date "${RUN_DATE}" \ + --branch "${BRANCH}" \ + --github-run-url "${GITHUB_RUN_URL}" \ + --workflow-jobs "${WORKFLOW_JOBS_JSON}" + +# --- Generate presigned URLs for reports (7-day expiry) --- +PRESIGN_EXPIRY=604800 +PRESIGNED_HTML=$(aws s3 presign "${S3_CONSOLIDATED_HTML}" --expires-in "${PRESIGN_EXPIRY}" 2>&1) || { + echo "WARNING: Failed to generate presigned URL for report: ${PRESIGNED_HTML}" >&2 + PRESIGNED_HTML="" +} +PRESIGNED_DASHBOARD=$(aws s3 presign "${S3_DASHBOARD_URI}" --expires-in "${PRESIGN_EXPIRY}" 2>&1) || { + echo "WARNING: Failed to generate presigned URL for dashboard: ${PRESIGNED_DASHBOARD}" >&2 + PRESIGNED_DASHBOARD="" +} + +# Send consolidated Slack notification if webhook is available and this is a nightly build +if [ -n "${CUOPT_SLACK_WEBHOOK_URL:-}" ] && [ "${RAPIDS_BUILD_TYPE:-}" = "nightly" ]; then + echo "Sending consolidated Slack notification" + CONSOLIDATED_SUMMARY="${OUTPUT_DIR}/consolidated_summary.json" \ + CONSOLIDATED_HTML="${OUTPUT_DIR}/consolidated_report.html" \ + SLACK_WEBHOOK_URL="${CUOPT_SLACK_WEBHOOK_URL}" \ + SLACK_BOT_TOKEN="${CUOPT_SLACK_BOT_TOKEN:-}" \ + SLACK_CHANNEL_ID="${CUOPT_SLACK_CHANNEL_ID:-}" \ + PRESIGNED_REPORT_URL="${PRESIGNED_HTML}" \ + PRESIGNED_DASHBOARD_URL="${PRESIGNED_DASHBOARD}" \ + bash "${SCRIPT_DIR}/utils/send_consolidated_summary.sh" +fi + +echo "Nightly summary complete." 
diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh
index fc1de8e1b4..f1d57519b1 100755
--- a/ci/run_ctests.sh
+++ b/ci/run_ctests.sh
@@ -21,16 +21,44 @@ else
     exit 1
 fi
 
-for gt in "${GTEST_DIR}"/*_TEST; do
+# Number of extra attempts granted to a gtest binary after its first failure.
+GTEST_MAX_RETRIES=${GTEST_MAX_RETRIES:-1}
+
+# Run the gtest binary given as $1, forwarding the remaining arguments to it.
+# A failing binary is re-run up to GTEST_MAX_RETRIES times; returns 0 as soon as
+# one attempt passes and 1 once every attempt has failed.
+run_gtest_with_retry() {
+    local gt="$1"
+    shift
+    local test_name
     test_name=$(basename "${gt}")
+
     echo "Running gtest ${test_name}"
-    "${gt}" "$@"
+    if "${gt}" "$@"; then
+        return 0
+    fi
+
+    local attempt
+    for attempt in $(seq 1 "${GTEST_MAX_RETRIES}"); do
+        echo "WARNING: ${test_name} failed, retry ${attempt}/${GTEST_MAX_RETRIES}"
+        if "${gt}" "$@"; then
+            echo "FLAKY: ${test_name} passed on retry ${attempt}"
+            return 0
+        fi
+    done
+
+    echo "FAILED: ${test_name} failed after $((GTEST_MAX_RETRIES + 1)) attempts"
+    return 1
+}
+
+for gt in "${GTEST_DIR}"/*_TEST; do
+    run_gtest_with_retry "${gt}" "$@"
 done
 
 # Run C_API_TEST with CPU memory for local solves (excluding time limit tests)
 if [ -x "${GTEST_DIR}/C_API_TEST" ]; then
     echo "Running gtest C_API_TEST with CUOPT_USE_CPU_MEM_FOR_LOCAL"
-    CUOPT_USE_CPU_MEM_FOR_LOCAL=1 "${GTEST_DIR}/C_API_TEST" --gtest_filter=-c_api/TimeLimitTestFixture.* "$@"
+    CUOPT_USE_CPU_MEM_FOR_LOCAL=1 run_gtest_with_retry "${GTEST_DIR}/C_API_TEST" --gtest_filter=-c_api/TimeLimitTestFixture.* "$@"
 else
     echo "Skipping C_API_TEST with CUOPT_USE_CPU_MEM_FOR_LOCAL (binary not found)"
 fi
diff --git a/ci/run_cuopt_pytests.sh b/ci/run_cuopt_pytests.sh
index 66e996715a..080fa42a1b 100755
--- a/ci/run_cuopt_pytests.sh
+++ b/ci/run_cuopt_pytests.sh
@@ -9,4 +9,4 @@ set -euo pipefail
 # Support invoking run_cuopt_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuopt/cuopt/
 
-pytest -s --cache-clear "$@" tests
+pytest -s --cache-clear --reruns 2 --reruns-delay 5 "$@" tests
diff --git a/ci/run_cuopt_server_pytests.sh b/ci/run_cuopt_server_pytests.sh
index 4cb361a473..75d87d255d 100755
--- a/ci/run_cuopt_server_pytests.sh
+++ b/ci/run_cuopt_server_pytests.sh
@@ -9,4 +9,4 @@ set -euo pipefail
 # Support invoking run_cuopt_server_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuopt_server/cuopt_server/
 
-pytest -s --cache-clear "$@" tests
+pytest -s --cache-clear --reruns 2 --reruns-delay 5 "$@" tests
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index 653c44133a..a68e0c7979 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 set -euo pipefail
@@ -54,5 +54,9 @@ export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/
 rapids-logger "Run gtests"
 timeout 40m ./ci/run_ctests.sh
 
+rapids-logger "Generate nightly test report"
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "cpp"
+
 rapids-logger "Test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh
index 22c41af84c..0b2b339ba1 100755
--- a/ci/test_notebooks.sh
+++ b/ci/test_notebooks.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 set -euo pipefail
@@ -64,5 +64,11 @@ for nb in ${NBLIST}; do
     fi
 done
 
+popd
+
+rapids-logger "Generate nightly test report"
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "notebooks" --with-python-version
+
 rapids-logger "Notebook test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 4f91c83334..9af612ad76 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -77,5 +77,9 @@ timeout 20m ./ci/run_cuopt_server_pytests.sh \
 rapids-logger "Test skills/ assets (Python, C, CLI)"
 timeout 10m ./ci/test_skills_assets.sh
 
+rapids-logger "Generate nightly test report"
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "python" --with-python-version
+
 rapids-logger "Test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh
index a327082e83..878db67594 100755
--- a/ci/test_wheel_cuopt.sh
+++ b/ci/test_wheel_cuopt.sh
@@ -63,6 +63,14 @@ cd -
 RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
 export RAPIDS_DATASET_ROOT_DIR
 
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
+export RAPIDS_TESTS_DIR
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
+EXITCODE=0
+trap "EXITCODE=1" ERR
+set +e
+
 # Run CLI tests
 timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh
 
@@ -71,7 +79,9 @@ timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh
 # Due to race condition in certain cases UCX might not be able to cleanup properly, so we set the number of threads to 1
 export OMP_NUM_THREADS=1
 
-timeout 30m ./ci/run_cuopt_pytests.sh --verbose --capture=no
+timeout 30m ./ci/run_cuopt_pytests.sh \
+    --junitxml="${RAPIDS_TESTS_DIR}/junit-wheel-cuopt.xml" \
+    --verbose --capture=no
 
 # run thirdparty integration tests for only nightly builds
 if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then
@@ -80,3 +90,9 @@ if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then
     ./ci/thirdparty-testing/run_pulp_tests.sh
     ./ci/thirdparty-testing/run_pyomo_tests.sh
 fi
+
+# Generate nightly test report
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "wheel-python" --with-python-version
+
+exit ${EXITCODE}
diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh
index a76969b965..55852a913c 100755
--- a/ci/test_wheel_cuopt_server.sh
+++ b/ci/test_wheel_cuopt_server.sh
@@ -39,7 +39,24 @@ rapids-pip-retry install \
 RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
 export RAPIDS_DATASET_ROOT_DIR
 
-timeout 30m ./ci/run_cuopt_server_pytests.sh --verbose --capture=no
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
+# Exported (as in ci/test_wheel_cuopt.sh) so child processes inherit the same directory.
+export RAPIDS_TESTS_DIR
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
+EXITCODE=0
+trap "EXITCODE=1" ERR
+set +e
+
+timeout 30m ./ci/run_cuopt_server_pytests.sh \
+    --junitxml="${RAPIDS_TESTS_DIR}/junit-wheel-cuopt-server.xml" \
+    --verbose --capture=no
 
 # Run documentation tests
 ./ci/test_doc_examples.sh
+
+# Generate nightly test report
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "wheel-server" --with-python-version
+
+exit ${EXITCODE}
diff --git a/ci/thirdparty-testing/run_cvxpy_tests.sh b/ci/thirdparty-testing/run_cvxpy_tests.sh
index c336f6a800..4b874fc4f0 100755
--- a/ci/thirdparty-testing/run_cvxpy_tests.sh
+++ b/ci/thirdparty-testing/run_cvxpy_tests.sh
@@ -32,10 +32,14 @@ python -m pip install \
 # ensure that environment is still consistent (i.e. cvxpy requirements do not conflict with cuopt's)
 pip check
 
+RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}"
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
 echo "running 'cvxpy' tests"
 timeout 3m python -m pytest \
     --verbose \
     --capture=no \
     --error-for-skips \
+    --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-cvxpy.xml" \
     -k "TestCUOPT" \
     ./cvxpy/tests/test_conic_solvers.py
diff --git a/ci/thirdparty-testing/run_pulp_tests.sh b/ci/thirdparty-testing/run_pulp_tests.sh
index f9cb0ca8a5..2c26db7a23 100755
--- a/ci/thirdparty-testing/run_pulp_tests.sh
+++ b/ci/thirdparty-testing/run_pulp_tests.sh
@@ -23,6 +23,9 @@ python -m pip install \
 
 pip check
 
+RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}"
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
 rapids-logger "running PuLP tests (cuOpt-related)"
 # PuLP uses pytest; run only tests that reference cuopt/CUOPT
 # Exit code 5 = no tests collected; then try run_tests.py which detects solvers (including cuopt)
@@ -30,6 +33,7 @@ pytest_rc=0
 timeout 5m python -m pytest \
     --verbose \
     --capture=no \
+    --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-pulp.xml" \
     -k "cuopt or CUOPT" \
     pulp/tests/ || pytest_rc=$?
 
diff --git a/ci/thirdparty-testing/run_pyomo_tests.sh b/ci/thirdparty-testing/run_pyomo_tests.sh index f50df676c9..d2b0639f6e 100755 --- a/ci/thirdparty-testing/run_pyomo_tests.sh +++ b/ci/thirdparty-testing/run_pyomo_tests.sh @@ -23,11 +23,15 @@ python -m pip install \ pip check +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +mkdir -p "${RAPIDS_TESTS_DIR}" + rapids-logger "running Pyomo tests (cuopt_direct / cuOpt-related)" # Run only tests that reference cuopt (cuopt_direct solver) timeout 5m python -m pytest \ --verbose \ --capture=no \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-pyomo.xml" \ -k "cuopt or CUOPT" \ pyomo/solvers/tests/ diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py new file mode 100644 index 0000000000..04989a4846 --- /dev/null +++ b/ci/utils/aggregate_nightly.py @@ -0,0 +1,770 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Aggregate per-matrix nightly test summaries into a single consolidated report. + +Runs as a post-test job after all matrix CI jobs finish. It: + 1. Lists all JSON summaries uploaded to S3 for today's date + 2. Downloads and merges them + 3. Builds a matrix grid (test_type x matrix_label → status) + 4. Generates a consolidated JSON, HTML report, and Slack payload + 5. 
Uploads the consolidated report to S3 + +Usage: + python ci/utils/aggregate_nightly.py \\ + --s3-summaries-prefix s3://bucket/ci_test_reports/nightly/summaries/2026-04-13/ \\ + --s3-reports-prefix s3://bucket/ci_test_reports/nightly/reports/2026-04-13/ \\ + --output-dir /tmp/aggregate-output \\ + --date 2026-04-13 \\ + --branch main +""" + +import argparse +import json +import os +import sys +from datetime import datetime, timezone +from pathlib import Path + +# Ensure ci/utils is importable when invoked as a script +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from s3_helpers import s3_download, s3_upload, s3_list # noqa: E402 + + +# --------------------------------------------------------------------------- +# Download and merge summaries +# --------------------------------------------------------------------------- + + +def download_summaries(s3_prefix, local_dir, s3_fallback_prefix=""): + """Download all JSON summaries from S3 prefix into local_dir. + If s3_fallback_prefix is set and no summaries found at s3_prefix, + retries with the fallback (used when RAPIDS_BRANCH in rapidsai + containers doesn't match the branch input). 
+ Returns list of loaded summary dicts.""" + local_dir = Path(local_dir) + local_dir.mkdir(parents=True, exist_ok=True) + + uris = s3_list(s3_prefix) + json_uris = [ + u for u in uris + if u.endswith(".json") and not u.endswith("/consolidated.json") + ] + + # Fallback: search the parent date prefix if branch-specific path is empty + if not json_uris and s3_fallback_prefix and s3_fallback_prefix != s3_prefix: + print(f"No summaries at {s3_prefix}, trying fallback: {s3_fallback_prefix}") + uris = s3_list(s3_fallback_prefix) + json_uris = [ + u for u in uris + if u.endswith(".json") and not u.endswith("/consolidated.json") + ] + if json_uris: + s3_prefix = s3_fallback_prefix + + print(f"Found {len(json_uris)} summary file(s) at {s3_prefix}") + + summaries = [] + for uri in json_uris: + filename = uri.rsplit("/", 1)[-1] + local_path = str(local_dir / filename) + if s3_download(uri, local_path): + try: + with open(local_path) as f: + summaries.append(json.load(f)) + except (json.JSONDecodeError, OSError) as exc: + print( + f"WARNING: Failed to parse {local_path}: {exc}", + file=sys.stderr, + ) + return summaries + + +def load_local_summaries(local_dir): + """Load summaries from a local directory (for testing without S3).""" + local_dir = Path(local_dir) + summaries = [] + for json_file in sorted(local_dir.glob("*.json")): + try: + with open(json_file) as f: + summaries.append(json.load(f)) + except (json.JSONDecodeError, OSError) as exc: + print( + f"WARNING: Failed to parse {json_file}: {exc}", file=sys.stderr + ) + return summaries + + +# --------------------------------------------------------------------------- +# Aggregation +# --------------------------------------------------------------------------- + + +def aggregate_summaries(summaries): + """Merge per-matrix summaries into a consolidated view. 
+ + Returns a dict with: + - matrix_grid: list of {test_type, matrix_label, status, counts, ...} + - totals: aggregate counts + - all_new_failures, all_recurring_failures, all_flaky_tests, + all_resolved_tests: merged lists with matrix context added + """ + grid = [] + totals = { + "total": 0, + "passed": 0, + "failed": 0, + "flaky": 0, + "skipped": 0, + "resolved": 0, + } + all_new_failures = [] + all_recurring_failures = [] + all_flaky_tests = [] + all_resolved_tests = [] + + for s in summaries: + test_type = s.get("test_type", "unknown") + matrix_label = s.get("matrix_label", "unknown") + counts = s.get("counts", {}) + + # Determine job status + failed = counts.get("failed", 0) + flaky = counts.get("flaky", 0) + has_new = s.get("has_new_failures", False) + + if failed > 0: + status = "failed-new" if has_new else "failed-recurring" + elif flaky > 0: + status = "flaky" + elif counts.get("total", 0) == 0: + status = "no-results" + else: + status = "passed" + + grid.append( + { + "test_type": test_type, + "matrix_label": matrix_label, + "status": status, + "counts": counts, + "sha": s.get("sha", ""), + } + ) + + # Accumulate totals + for key in totals: + totals[key] += counts.get(key, 0) + + # Merge failure lists with matrix context + ctx = {"test_type": test_type, "matrix_label": matrix_label} + for entry in s.get("new_failures", []): + all_new_failures.append({**entry, **ctx}) + for entry in s.get("recurring_failures", []): + all_recurring_failures.append({**entry, **ctx}) + for entry in s.get("flaky_tests", []): + all_flaky_tests.append({**entry, **ctx}) + for entry in s.get("resolved_tests", []): + all_resolved_tests.append({**entry, **ctx}) + + # Sort grid for consistent display + grid.sort(key=lambda g: (g["test_type"], g["matrix_label"])) + + return { + "matrix_grid": grid, + "totals": totals, + "all_new_failures": all_new_failures, + "all_recurring_failures": all_recurring_failures, + "all_flaky_tests": all_flaky_tests, + "all_resolved_tests": 
all_resolved_tests, + } + + +# --------------------------------------------------------------------------- +# Consolidated JSON +# --------------------------------------------------------------------------- + + +def parse_workflow_jobs(workflow_jobs_path): + """Parse GitHub Actions workflow job statuses from JSON file. + Returns all jobs (except nightly-summary itself) with name, + conclusion, URL, and whether they are tracked by per-matrix + S3 summaries.""" + if not workflow_jobs_path or not Path(workflow_jobs_path).exists(): + return [] + + # Job name prefixes that are covered by per-matrix S3 reports. + # These jobs also have detailed test results; other jobs only have + # a pass/fail status at the workflow level. + TRACKED_PREFIXES = ( + "conda-cpp-tests", + "conda-python-tests", + "wheel-tests-cuopt-server", + "wheel-tests-cuopt", + ) + + try: + with open(workflow_jobs_path) as f: + data = json.load(f) + jobs_list = data.get("jobs", []) + result = [] + for job in jobs_list: + name = job.get("name", "") + # Skip the nightly-summary job itself + if "nightly-summary" in name.lower(): + continue + # Skip helper jobs (compute-matrix, etc.) 
+ if "compute-matrix" in name.lower(): + continue + tracked = any(name.startswith(p) for p in TRACKED_PREFIXES) + result.append({ + "name": name, + "conclusion": job.get("conclusion", "unknown"), + "status": job.get("status", "unknown"), + "url": job.get("html_url", ""), + "has_test_details": tracked, + }) + return result + except (json.JSONDecodeError, OSError) as exc: + print( + f"WARNING: Failed to parse workflow jobs: {exc}", + file=sys.stderr, + ) + return [] + + +def generate_consolidated_json(agg, date_str, branch, github_run_url="", + workflow_jobs=None): + """Generate the consolidated JSON for Slack and dashboard.""" + total_jobs = len(agg["matrix_grid"]) + failed_jobs = sum( + 1 for g in agg["matrix_grid"] if g["status"].startswith("failed") + ) + flaky_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "flaky") + passed_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "passed") + + # Workflow-level CI job statuses + wf_jobs = workflow_jobs or [] + failed_ci_jobs = [j for j in wf_jobs if j["conclusion"] == "failure"] + # Jobs without per-matrix S3 tracking (notebooks, JuMP, etc.) 
+ untracked_failed = [ + j for j in failed_ci_jobs if not j.get("has_test_details", False) + ] + + return { + "timestamp": datetime.now(timezone.utc).isoformat(), + "date": date_str, + "branch": branch, + "github_run_url": github_run_url, + "job_summary": { + "total": total_jobs, + "passed": passed_jobs, + "failed": failed_jobs, + "flaky": flaky_jobs, + }, + "test_totals": agg["totals"], + "has_new_failures": len(agg["all_new_failures"]) > 0, + "matrix_grid": agg["matrix_grid"], + "new_failures": agg["all_new_failures"], + "recurring_failures": agg["all_recurring_failures"], + "flaky_tests": agg["all_flaky_tests"], + "resolved_tests": agg["all_resolved_tests"], + "workflow_jobs": wf_jobs, + "failed_ci_jobs": failed_ci_jobs, + "untracked_failed_ci_jobs": untracked_failed, + } + + +# --------------------------------------------------------------------------- +# Consolidated HTML +# --------------------------------------------------------------------------- + + +def _html_escape(text): + return ( + str(text) + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) + + +def _status_badge(status): + """Return an HTML badge for a matrix cell status.""" + colors = { + "passed": ("#388e3c", "PASS"), + "failed-new": ("#d32f2f", "NEW FAIL"), + "failed-recurring": ("#e65100", "RECURRING"), + "flaky": ("#f9a825", "FLAKY"), + "no-results": ("#757575", "NO DATA"), + } + bg, label = colors.get(status, ("#757575", status.upper())) + text_color = "#212121" if status == "flaky" else "#fff" + return ( + f'' + f"{label}" + ) + + +def generate_consolidated_html( + agg, + date_str, + branch, + github_run_url="", + s3_reports_prefix="", +): + """Generate a consolidated HTML dashboard for all matrix combos.""" + total_jobs = len(agg["matrix_grid"]) + failed_jobs = sum( + 1 for g in agg["matrix_grid"] if g["status"].startswith("failed") + ) + + if failed_jobs > 0: + bar_color = "#d32f2f" + bar_text = f"{failed_jobs} of {total_jobs} matrix jobs have failures" 
+ elif any(g["status"] == "flaky" for g in agg["matrix_grid"]): + bar_color = "#f9a825" + bar_text = "All jobs passed (flaky tests detected)" + else: + bar_color = "#388e3c" + bar_text = f"All {total_jobs} matrix jobs passed" + + totals = agg["totals"] + + parts = [] + parts.append(f""" + + + + +cuOpt Nightly — {_html_escape(branch)} — {_html_escape(date_str)} + + + +

cuOpt Nightly Tests — {_html_escape(branch)}

+
+ Date: {_html_escape(date_str)}""") + + if github_run_url: + parts.append( + f'  |  ' + f"GitHub Actions Run" + ) + + parts.append(f"""
+
{bar_text}
+
+
{totals["total"]}
Total Tests
+
{totals["passed"]}
Passed
+
{totals["failed"]}
Failed
+
{totals["flaky"]}
Flaky
+
Skipped
+
{totals["resolved"]}
Stabilized
+
""") + + # --- New failures --- + if agg["all_new_failures"]: + parts.append("

New Failures

") + parts.append( + "" + "" + ) + for e in agg["all_new_failures"]: + msg = _html_escape(e.get("message", "")) + short = _html_escape(e.get("message", "")[:100]) + parts.append( + f"" + f"" + f"" + f"" + f"' + ) + parts.append("
Test TypeMatrixSuiteTestError
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}
{short}" + f'
{msg}
") + + # --- Recurring failures --- + if agg["all_recurring_failures"]: + parts.append("

Recurring Failures

") + parts.append( + "" + "" + ) + for e in agg["all_recurring_failures"]: + msg = _html_escape(e.get("message", "")) + short = _html_escape(e.get("message", "")[:100]) + parts.append( + f"" + f"" + f"" + f"" + f"" + f"' + ) + parts.append("
Test TypeMatrixSuiteTestSinceError
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{_html_escape(e.get('first_seen', '?'))}
{short}" + f'
{msg}
") + + # --- Resolved --- + if agg["all_resolved_tests"]: + parts.append("

Stabilized Tests

") + parts.append( + "" + "" + ) + for e in agg["all_resolved_tests"]: + parts.append( + f"" + f"" + f"" + f"" + f"" + f"" + ) + parts.append("
Test TypeMatrixSuiteTestFailing SinceCount
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{_html_escape(e.get('first_seen', '?'))}{e.get('failure_count', '?')}
") + + # --- Flaky --- + if agg["all_flaky_tests"]: + parts.append("

Flaky Tests

") + parts.append( + "" + "" + ) + for e in agg["all_flaky_tests"]: + parts.append( + f"" + f"" + f"" + f"" + f"" + ) + parts.append("
Test TypeMatrixSuiteTestRetries
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{e.get('retry_count', '?')}
") + + if ( + not agg["all_new_failures"] + and not agg["all_recurring_failures"] + and not agg["all_flaky_tests"] + and not agg["all_resolved_tests"] + ): + parts.append( + '

' + "All tests passed across all matrices!

" + ) + + # --- Matrix grid (at the end) --- + parts.append("

Matrix Overview

") + parts.append( + "" + "" + ) + for g in agg["matrix_grid"]: + counts = g["counts"] + report_link = "" + if s3_reports_prefix: + report_filename = f"{g['test_type']}-{g['matrix_label']}.html" + report_link = ( + f'View' + ) + parts.append( + f"" + f"" + f"" + f"" + f"" + f"" + f"" + f"" + ) + parts.append("
Test TypeMatrixStatusPassedFailedFlakyTotalReport
{_html_escape(g['test_type'])}{_html_escape(g['matrix_label'])}{_status_badge(g['status'])}{counts.get('passed', 0)}{counts.get('failed', 0)}{counts.get('flaky', 0)}{counts.get('total', 0)}{report_link}
") + + parts.append("") + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Index management +# --------------------------------------------------------------------------- + +MAX_INDEX_DAYS = 90 # Keep at most 90 days in the index + + +def update_index(s3_index_uri, date_str, consolidated, output_dir): + """Download index.json, add today's entry, prune old entries, re-upload.""" + local_index = str(output_dir / "index.json") + + # Download existing index (or start fresh) + index = {"_schema_version": 1, "dates": {}} + if s3_download(s3_index_uri, local_index): + try: + with open(local_index) as f: + loaded = json.load(f) + if "dates" in loaded: + index = loaded + except (json.JSONDecodeError, OSError): + pass + + # Add today's entry keyed by date/branch for multi-branch support + branch = consolidated.get("branch", "main") + entry_key = f"{date_str}/{branch}" + index["dates"][entry_key] = { + "date": date_str, + "branch": branch, + "job_summary": consolidated.get("job_summary", {}), + "test_totals": consolidated.get("test_totals", {}), + "has_new_failures": consolidated.get("has_new_failures", False), + "github_run_url": consolidated.get("github_run_url", ""), + } + + # Prune to last N entries + dates_sorted = sorted(index["dates"].keys(), reverse=True) + if len(dates_sorted) > MAX_INDEX_DAYS: + for old_key in dates_sorted[MAX_INDEX_DAYS:]: + del index["dates"][old_key] + + # Write and upload + with open(local_index, "w") as f: + json.dump(index, f, indent=2, sort_keys=True) + f.write("\n") + print(f"Updated index.json with {len(index['dates'])} date(s)") + + s3_upload(local_index, s3_index_uri) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate per-matrix nightly test summaries" + ) + parser.add_argument( + 
"--s3-summaries-prefix", + default="", + help="S3 prefix for per-matrix JSON summaries (e.g., s3://bucket/.../summaries/2026-04-13/)", + ) + parser.add_argument( + "--s3-summaries-fallback", + default="", + help="Fallback S3 prefix if no summaries found at primary prefix", + ) + parser.add_argument( + "--s3-reports-prefix", + default="", + help="S3 prefix where per-matrix HTML reports live (for linking)", + ) + parser.add_argument( + "--s3-output-uri", + default="", + help="S3 URI to upload the consolidated JSON", + ) + parser.add_argument( + "--s3-html-output-uri", + default="", + help="S3 URI to upload the consolidated HTML report", + ) + parser.add_argument( + "--s3-index-uri", + default="", + help="S3 URI for the index.json that tracks all available dates (read + write)", + ) + parser.add_argument( + "--s3-dashboard-uri", + default="", + help="S3 URI to upload the dashboard HTML (e.g., s3://bucket/.../dashboard/index.html)", + ) + parser.add_argument( + "--dashboard-dir", + default="", + help="Local directory containing dashboard files to upload", + ) + parser.add_argument( + "--local-summaries-dir", + default="", + help="Local directory with JSON summaries (alternative to S3, for testing)", + ) + parser.add_argument( + "--output-dir", + default="aggregate-output", + help="Local directory to write output files", + ) + parser.add_argument( + "--date", + default=datetime.now(timezone.utc).strftime("%Y-%m-%d"), + help="Date for this run (YYYY-MM-DD)", + ) + parser.add_argument("--branch", default="main", help="Branch name") + parser.add_argument( + "--github-run-url", + default="", + help="URL to the GitHub Actions run", + ) + parser.add_argument( + "--workflow-jobs", + default="", + help="Path to JSON file with GitHub Actions workflow job statuses", + ) + + args = parser.parse_args() + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # ---- Step 1: Collect summaries ---- + if args.local_summaries_dir: + summaries = 
load_local_summaries(args.local_summaries_dir) + elif args.s3_summaries_prefix: + download_dir = output_dir / "downloaded_summaries" + summaries = download_summaries( + args.s3_summaries_prefix, download_dir, args.s3_summaries_fallback + ) + else: + print( + "ERROR: Provide --s3-summaries-prefix or --local-summaries-dir", + file=sys.stderr, + ) + return 1 + + if not summaries: + print( + "WARNING: No summaries found. Generating empty report.", + file=sys.stderr, + ) + + print(f"Loaded {len(summaries)} matrix summary file(s)") + + # ---- Step 2: Aggregate ---- + agg = aggregate_summaries(summaries) + print( + f"Matrix grid: {len(agg['matrix_grid'])} jobs — " + f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'passed')} passed, " + f"{sum(1 for g in agg['matrix_grid'] if g['status'].startswith('failed'))} failed, " + f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'flaky')} flaky" + ) + + # ---- Step 2b: Parse workflow job statuses ---- + workflow_jobs = parse_workflow_jobs(args.workflow_jobs) + if workflow_jobs: + failed_wf = [j for j in workflow_jobs if j["conclusion"] == "failure"] + print( + f"Workflow jobs: {len(workflow_jobs)} total, " + f"{len(failed_wf)} failed" + ) + + # ---- Step 3: Generate outputs ---- + consolidated = generate_consolidated_json( + agg, + args.date, + args.branch, + args.github_run_url, + workflow_jobs, + ) + + json_path = output_dir / "consolidated_summary.json" + json_path.write_text(json.dumps(consolidated, indent=2) + "\n") + print(f"Consolidated JSON written to {json_path}") + + html_report = generate_consolidated_html( + agg, + args.date, + args.branch, + args.github_run_url, + args.s3_reports_prefix, + ) + html_path = output_dir / "consolidated_report.html" + html_path.write_text(html_report) + print(f"Consolidated HTML written to {html_path}") + + # ---- Step 4: Upload to S3 ---- + if args.s3_output_uri: + s3_upload(str(json_path), args.s3_output_uri) + if args.s3_html_output_uri: + s3_upload(str(html_path), 
args.s3_html_output_uri) + + # ---- Step 5: Update index.json ---- + if args.s3_index_uri: + update_index( + args.s3_index_uri, + args.date, + consolidated, + output_dir, + ) + + # ---- Step 6: Upload dashboard (self-contained with embedded data) ---- + if args.s3_dashboard_uri and args.dashboard_dir: + dashboard_file = Path(args.dashboard_dir) / "index.html" + if dashboard_file.exists(): + # Read the index.json we just uploaded/created + index_path = output_dir / "index.json" + index_data = {} + if index_path.exists(): + with open(index_path) as f: + index_data = json.load(f) + + # Inject data into dashboard HTML so it works without S3 fetches + dashboard_html = dashboard_file.read_text() + inject_script = ( + "\n" + ) + # Insert before + dashboard_html = dashboard_html.replace( + "", inject_script + "" + ) + + embedded_path = output_dir / "dashboard.html" + embedded_path.write_text(dashboard_html) + s3_upload(str(embedded_path), args.s3_dashboard_uri) + print(f"Dashboard uploaded with embedded data") + else: + print( + f"WARNING: Dashboard not found at {dashboard_file}", + file=sys.stderr, + ) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py new file mode 100755 index 0000000000..2bd23b1f18 --- /dev/null +++ b/ci/utils/nightly_report.py @@ -0,0 +1,1005 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Nightly test report generator for cuOpt CI. 
+ +Parses JUnit XML test results, classifies failures as flaky vs genuine, +maintains a failure history database on S3, and outputs: + - HTML report (detailed, uploaded to S3 and linked from Slack) + - Markdown summary (for $GITHUB_STEP_SUMMARY or terminal) + - JSON summary (for downstream consumers like Slack notifier and dashboard) + +Each CI matrix job (CUDA version x Python version x architecture) runs this +script independently. The --test-type and --matrix-label flags identify the +job so that history and summaries are stored per-matrix-combo. + +History lifecycle: + 1. Download history from S3 (falls back to empty if not found) + 2. Classify this run's results + 3. Update history: mark new failures, bump recurring counts, resolve stabilized tests + 4. Upload updated history back to S3 + 5. Generate reports (HTML, Markdown, JSON, GitHub Step Summary) + 6. Upload per-run JSON snapshot to S3 summaries dir (for aggregation) + +Usage: + python ci/utils/nightly_report.py \\ + --results-dir test-results/ \\ + --output-dir report-output/ \\ + --sha abc123 \\ + --test-type python \\ + --matrix-label cuda12.9-py3.12-x86_64 \\ + --s3-history-uri s3://bucket/ci_test_reports/nightly/history/python-main-cuda12.9-py3.12-x86_64.json \\ + --s3-summary-uri s3://bucket/ci_test_reports/nightly/summaries/2026-04-13/python-cuda12.9-py3.12-x86_64.json +""" + +import argparse +import json +import os +import sys +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path +from xml.etree import ElementTree + +# Ensure ci/utils is importable when invoked as a script +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from s3_helpers import s3_download, s3_upload # noqa: E402 + +EMPTY_HISTORY = {"_schema_version": 2, "tests": {}} + +# A test that resolves then fails again within this window is considered +# "bouncing" (intermittently flaky) rather than a new failure. 
+BOUNCE_WINDOW_DAYS = int(os.environ.get("CUOPT_BOUNCE_WINDOW_DAYS", 14)) + +# Number of failure/resolve cycles that classify a test as cross-run flaky. +BOUNCE_THRESHOLD = int(os.environ.get("CUOPT_BOUNCE_THRESHOLD", 2)) + + +# --------------------------------------------------------------------------- +# JUnit XML parsing +# --------------------------------------------------------------------------- + + +def parse_junit_xml(xml_path): + """Parse a JUnit XML file and return a list of test result dicts.""" + results = [] + try: + tree = ElementTree.parse(xml_path) + except ElementTree.ParseError as e: + print(f"WARNING: Failed to parse {xml_path}: {e}", file=sys.stderr) + return results + + root = tree.getroot() + + if root.tag == "testsuites": + suites = root.findall("testsuite") + elif root.tag == "testsuite": + suites = [root] + else: + return results + + for suite in suites: + suite_name = suite.get("name", os.path.basename(xml_path)) + for testcase in suite.findall("testcase"): + name = testcase.get("name", "unknown") + classname = testcase.get("classname", "") + time_taken = testcase.get("time", "0") + + failure = testcase.find("failure") + error = testcase.find("error") + skipped = testcase.find("skipped") + + if skipped is not None: + status = "skipped" + message = skipped.get("message", "") + elif failure is not None: + status = "failed" + message = failure.get("message", "") + if failure.text: + message = failure.text[:500] + elif error is not None: + status = "error" + message = error.get("message", "") + if error.text: + message = error.text[:500] + else: + status = "passed" + message = "" + + results.append( + { + "suite": suite_name, + "classname": classname, + "name": name, + "status": status, + "time": time_taken, + "message": message, + "source_file": str(xml_path), + } + ) + + return results + + +def collect_all_results(results_dir): + """Collect test results from all JUnit XML files in a directory.""" + results_dir = Path(results_dir) + 
all_results = [] + for xml_file in sorted(results_dir.rglob("*.xml")): + all_results.extend(parse_junit_xml(xml_file)) + return all_results + + +# --------------------------------------------------------------------------- +# Classification +# --------------------------------------------------------------------------- + + +def classify_failures(results): + """ + Classify test results into passed, failed, flaky, skipped, and error. + + pytest-rerunfailures records reruns as additional entries. + A test that failed then passed on rerun is flaky. + """ + test_groups = defaultdict(list) + for r in results: + key = f"{r['suite']}::{r['classname']}::{r['name']}" + test_groups[key].append(r) + + classified = { + "passed": [], + "failed": [], + "flaky": [], + "skipped": [], + "error": [], + } + + for key, entries in test_groups.items(): + statuses = [e["status"] for e in entries] + + if all(s == "skipped" for s in statuses): + classified["skipped"].append(entries[0]) + elif any(s == "passed" for s in statuses): + if any(s in ("failed", "error") for s in statuses): + entry = entries[-1].copy() + entry["status"] = "flaky" + entry["retry_count"] = sum( + 1 for s in statuses if s in ("failed", "error") + ) + classified["flaky"].append(entry) + else: + classified["passed"].append(entries[-1]) + elif any(s == "error" for s in statuses): + classified["error"].append(entries[-1]) + else: + classified["failed"].append(entries[-1]) + + return classified + + +# --------------------------------------------------------------------------- +# History management +# --------------------------------------------------------------------------- + + +def load_history(history_path): + """Load failure history from a local JSON file.""" + try: + with open(history_path) as f: + data = json.load(f) + if "tests" in data: + return data + except (FileNotFoundError, json.JSONDecodeError): + pass + return dict(EMPTY_HISTORY) + + +def _days_between(date_a, date_b): + """Return absolute number of days 
between two YYYY-MM-DD strings.""" + try: + a = datetime.strptime(date_a, "%Y-%m-%d") + b = datetime.strptime(date_b, "%Y-%m-%d") + return abs((a - b).days) + except (ValueError, TypeError): + return 999 + + +def _is_recent_resolve(rec, date_str): + """Check if a test was resolved recently (within bounce window).""" + resolved_date = rec.get("resolved_date", "") + if not resolved_date: + return False + return _days_between(resolved_date, date_str) <= BOUNCE_WINDOW_DAYS + + +def update_history(history, classified, sha, date_str): + """ + Update failure history with this run's results. + + Returns (history, new_failures, recurring_failures, resolved_tests). + + Classification logic: + - "new failure": never seen before (no history entry at all) + - "recurring": was already active (failing on previous runs) + - "bouncing": was resolved recently but failed again — reactivated + as recurring (not new), and marked cross-run flaky after 2+ bounces + - "resolved": was active, now passes — notified once, then silent + on subsequent passes + """ + tests = history.setdefault("tests", {}) + new_failures = [] + recurring_failures = [] + resolved_tests = [] + + # --- Genuine failures --- + for entry in classified["failed"] + classified["error"]: + test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" + + if test_key in tests: + rec = tests[test_key] + + if rec["status"] == "active": + # Still failing — bump count + rec["last_seen_date"] = date_str + rec["last_seen_sha"] = sha + rec["failure_count"] += 1 + recurring_failures.append( + {**entry, "first_seen": rec["first_seen_date"]} + ) + elif rec["status"] == "resolved" and _is_recent_resolve( + rec, date_str + ): + # Bouncing: resolved recently but failed again. + # Reactivate as recurring, not new. Track the bounce. 
+ rec["status"] = "active" + rec["last_seen_date"] = date_str + rec["last_seen_sha"] = sha + rec["failure_count"] += 1 + rec["bounce_count"] = rec.get("bounce_count", 0) + 1 + if rec["bounce_count"] >= BOUNCE_THRESHOLD: + rec["is_flaky"] = True + recurring_failures.append( + { + **entry, + "first_seen": rec["first_seen_date"], + "is_bouncing": True, + } + ) + else: + # Resolved long ago — treat as new cycle but keep history + rec["status"] = "active" + rec["last_seen_date"] = date_str + rec["last_seen_sha"] = sha + rec["failure_count"] += 1 + rec["bounce_count"] = rec.get("bounce_count", 0) + 1 + new_failures.append(entry) + else: + # Truly new — never seen before + tests[test_key] = { + "suite": entry["suite"], + "classname": entry["classname"], + "name": entry["name"], + "first_seen_date": date_str, + "first_seen_sha": sha, + "last_seen_date": date_str, + "last_seen_sha": sha, + "failure_count": 1, + "is_flaky": False, + "bounce_count": 0, + "status": "active", + } + new_failures.append(entry) + + # --- Flaky tests (passed on retry within this run) --- + for entry in classified["flaky"]: + test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" + if test_key in tests: + rec = tests[test_key] + rec["last_seen_date"] = date_str + rec["last_seen_sha"] = sha + rec["failure_count"] += 1 + rec["is_flaky"] = True + # If it was resolved, reactivate — it's still unstable + if rec["status"] == "resolved": + rec["status"] = "active" + rec["bounce_count"] = rec.get("bounce_count", 0) + 1 + else: + tests[test_key] = { + "suite": entry["suite"], + "classname": entry["classname"], + "name": entry["name"], + "first_seen_date": date_str, + "first_seen_sha": sha, + "last_seen_date": date_str, + "last_seen_sha": sha, + "failure_count": 1, + "is_flaky": True, + "bounce_count": 0, + "status": "active", + } + + # --- Resolve stabilized tests --- + passed_keys = set() + for entry in classified["passed"]: + test_key = 
f"{entry['suite']}::{entry['classname']}::{entry['name']}" + passed_keys.add(test_key) + + for test_key in passed_keys: + if test_key in tests and tests[test_key]["status"] == "active": + rec = tests[test_key] + rec["status"] = "resolved" + rec["resolved_date"] = date_str + rec["resolved_sha"] = sha + resolved_tests.append( + { + "suite": rec["suite"], + "classname": rec["classname"], + "name": rec["name"], + "first_seen": rec["first_seen_date"], + "failure_count": rec["failure_count"], + "bounce_count": rec.get("bounce_count", 0), + "was_flaky": rec.get("is_flaky", False), + } + ) + # If already "resolved" and passes again — no notification. + # The resolved notification was sent once when it first stabilized. + + return history, new_failures, recurring_failures, resolved_tests + + +def save_history(history, history_path): + """Write history to a local JSON file.""" + with open(history_path, "w") as f: + json.dump(history, f, indent=2, sort_keys=True) + f.write("\n") + + +# --------------------------------------------------------------------------- +# Report generation +# --------------------------------------------------------------------------- + + +def generate_markdown_report( + classified, + new_failures, + recurring_failures, + resolved_tests, + history, + test_type="", + matrix_label="", + sha="", + date_str="", +): + """Generate a Markdown summary report.""" + lines = [] + title = "# Nightly Test Report" + if test_type: + title += f" — {test_type}" + if matrix_label: + title += f" [{matrix_label}]" + lines.append(title) + lines.append("") + if date_str or sha: + meta_parts = [] + if date_str: + meta_parts.append(f"**Date:** {date_str}") + if sha: + meta_parts.append(f"**Commit:** `{sha[:12]}`") + if matrix_label: + meta_parts.append(f"**Matrix:** {matrix_label}") + lines.append(" | ".join(meta_parts)) + lines.append("") + + total_passed = len(classified["passed"]) + total_failed = len(classified["failed"]) + len(classified["error"]) + total_flaky = 
len(classified["flaky"]) + total_skipped = len(classified["skipped"]) + total = total_passed + total_failed + total_flaky + total_skipped + + lines.append("## Summary") + lines.append("") + lines.append("| Metric | Count |") + lines.append("|--------|-------|") + lines.append(f"| Total tests | {total} |") + lines.append(f"| Passed | {total_passed} |") + lines.append(f"| **Genuine failures** | **{total_failed}** |") + lines.append(f"| Flaky (passed on retry) | {total_flaky} |") + lines.append(f"| Skipped | {total_skipped} |") + if resolved_tests: + lines.append( + f"| **Stabilized (were failing, now pass)** | **{len(resolved_tests)}** |" + ) + lines.append("") + + # -- New genuine failures (highest priority) -- + if new_failures: + lines.append("## NEW Failures (not previously seen)") + lines.append("") + lines.append("| Suite | Test | Error |") + lines.append("|-------|------|-------|") + for entry in new_failures: + short_msg = ( + entry.get("message", "")[:80] + .replace("\n", " ") + .replace("|", "\\|") + ) + lines.append( + f"| {entry['suite']} | `{entry['name']}` | {short_msg} |" + ) + lines.append("") + + # -- Recurring failures -- + if recurring_failures: + lines.append("## Recurring Failures") + lines.append("") + lines.append("| Suite | Test | First seen | Failure count | Error |") + lines.append("|-------|------|------------|---------------|-------|") + for entry in recurring_failures: + short_msg = ( + entry.get("message", "")[:60] + .replace("\n", " ") + .replace("|", "\\|") + ) + first_seen = entry.get("first_seen", "unknown") + test_key = ( + f"{entry['suite']}::{entry['classname']}::{entry['name']}" + ) + count = ( + history.get("tests", {}) + .get(test_key, {}) + .get("failure_count", "?") + ) + lines.append( + f"| {entry['suite']} | `{entry['name']}` | {first_seen} | {count} | {short_msg} |" + ) + lines.append("") + + # -- Stabilized tests -- + if resolved_tests: + lines.append("## Stabilized Tests (were failing, now passing)") + lines.append("") + 
lines.append( + "| Suite | Test | Was failing since | Total failure count | Was flaky? |" + ) + lines.append( + "|-------|------|-------------------|---------------------|------------|" + ) + for entry in resolved_tests: + flaky_badge = "Yes" if entry.get("was_flaky") else "No" + lines.append( + f"| {entry['suite']} | `{entry['name']}` | {entry['first_seen']} " + f"| {entry['failure_count']} | {flaky_badge} |" + ) + lines.append("") + + # -- Flaky tests -- + if classified["flaky"]: + lines.append("## Flaky Tests (passed on retry)") + lines.append("") + lines.append("| Suite | Test | Retries needed |") + lines.append("|-------|------|----------------|") + for entry in classified["flaky"]: + retry_count = entry.get("retry_count", "?") + lines.append( + f"| {entry['suite']} | `{entry['name']}` | {retry_count} |" + ) + lines.append("") + + # -- Detailed errors -- + all_failures = classified["failed"] + classified["error"] + if all_failures: + lines.append("## All Failure Details") + lines.append("") + for entry in all_failures: + lines.append(f"### `{entry['classname']}::{entry['name']}`") + lines.append(f"- **Suite**: {entry['suite']}") + lines.append(f"- **Source**: {entry['source_file']}") + msg = entry.get("message", "").strip() + if msg: + lines.append("- **Error**:") + lines.append("```") + for line in msg.split("\n")[:20]: + lines.append(line) + lines.append("```") + lines.append("") + + if not all_failures and not classified["flaky"] and not resolved_tests: + lines.append("All tests passed! 
No failures or flaky tests detected.") + lines.append("") + + return "\n".join(lines) + + +def generate_json_summary( + classified, + new_failures, + recurring_failures, + resolved_tests, + test_type="", + matrix_label="", + sha="", + date_str="", +): + """Generate a JSON summary for downstream tools (Slack notifier, dashboard).""" + return { + "timestamp": datetime.now(timezone.utc).isoformat(), + "test_type": test_type, + "matrix_label": matrix_label, + "sha": sha, + "date": date_str, + "counts": { + "total": sum(len(v) for v in classified.values()), + "passed": len(classified["passed"]), + "failed": len(classified["failed"]) + len(classified["error"]), + "flaky": len(classified["flaky"]), + "skipped": len(classified["skipped"]), + "resolved": len(resolved_tests), + }, + "has_new_failures": len(new_failures) > 0, + "new_failures": [ + { + "suite": e["suite"], + "name": e["name"], + "classname": e["classname"], + "message": e.get("message", "")[:200], + } + for e in new_failures + ], + "recurring_failures": [ + { + "suite": e["suite"], + "name": e["name"], + "classname": e["classname"], + "first_seen": e.get("first_seen", "unknown"), + "message": e.get("message", "")[:200], + } + for e in recurring_failures + ], + "flaky_tests": [ + { + "suite": e["suite"], + "name": e["name"], + "classname": e["classname"], + "retry_count": e.get("retry_count", 0), + } + for e in classified["flaky"] + ], + "resolved_tests": [ + { + "suite": e["suite"], + "name": e["name"], + "classname": e["classname"], + "first_seen": e.get("first_seen", "unknown"), + "failure_count": e.get("failure_count", 0), + "was_flaky": e.get("was_flaky", False), + } + for e in resolved_tests + ], + } + + +# --------------------------------------------------------------------------- +# HTML report +# --------------------------------------------------------------------------- + + +def _html_escape(text): + """Escape HTML special characters.""" + return ( + text.replace("&", "&") + .replace("<", "<") + 
.replace(">", ">") + .replace('"', """) + ) + + +def generate_html_report( + classified, + new_failures, + recurring_failures, + resolved_tests, + history, + test_type="", + matrix_label="", + sha="", + date_str="", +): + """Generate a self-contained HTML report with detailed failure info.""" + total_passed = len(classified["passed"]) + total_failed = len(classified["failed"]) + len(classified["error"]) + total_flaky = len(classified["flaky"]) + total_skipped = len(classified["skipped"]) + total = total_passed + total_failed + total_flaky + total_skipped + + title = "Nightly Test Report" + if test_type: + title += f" — {_html_escape(test_type)}" + if matrix_label: + title += f" [{_html_escape(matrix_label)}]" + + # Determine overall status color + if total_failed > 0: + status_color = "#d32f2f" + status_text = f"{total_failed} failure(s)" + elif total_flaky > 0: + status_color = "#f9a825" + status_text = "All passed (flaky detected)" + else: + status_color = "#388e3c" + status_text = "All passed" + + parts = [] + parts.append(f""" + + + + +{title} + + + +

{title}

+
""") + + meta_parts = [] + if date_str: + meta_parts.append(f"Date: {_html_escape(date_str)}") + if sha: + meta_parts.append(f"Commit: {_html_escape(sha[:12])}") + if matrix_label: + meta_parts.append( + f"Matrix: {_html_escape(matrix_label)}" + ) + parts.append("  |  ".join(meta_parts)) + + parts.append(f"""
+
{status_text}
+
+
{total}
Total
+
{total_passed}
Passed
+
{total_failed}
Failed
+
{total_flaky}
Flaky
+
Skipped
+
{len(resolved_tests)}
Stabilized
+
""") + + # --- New failures --- + if new_failures: + parts.append("

New Failures

") + parts.append("") + for e in new_failures: + msg = _html_escape(e.get("message", "")) + short = _html_escape(e.get("message", "")[:100]) + parts.append( + f"" + f"' + f"' + ) + parts.append("
SuiteTestError
{_html_escape(e['suite'])}{_html_escape(e['name'])} " + f'NEW
{short}" + f'
{msg}
") + + # --- Recurring failures --- + if recurring_failures: + parts.append("

Recurring Failures

") + parts.append( + "" + "" + ) + for e in recurring_failures: + msg = _html_escape(e.get("message", "")) + short = _html_escape(e.get("message", "")[:100]) + first_seen = _html_escape(e.get("first_seen", "unknown")) + test_key = f"{e['suite']}::{e['classname']}::{e['name']}" + count = ( + history.get("tests", {}) + .get(test_key, {}) + .get("failure_count", "?") + ) + parts.append( + f"" + f"' + f"" + f"' + ) + parts.append("
SuiteTestFirst SeenCountError
{_html_escape(e['suite'])}{_html_escape(e['name'])} " + f'RECURRING{first_seen}{count}
{short}" + f'
{msg}
") + + # --- Stabilized --- + if resolved_tests: + parts.append("

Stabilized Tests

") + parts.append( + "" + "" + ) + for e in resolved_tests: + flaky_tag = "Yes" if e.get("was_flaky") else "No" + parts.append( + f"" + f"' + f"" + f"" + f"" + ) + parts.append("
SuiteTestFailing SinceFailure CountWas Flaky?
{_html_escape(e['suite'])}{_html_escape(e['name'])} " + f'FIXED{_html_escape(e.get('first_seen', '?'))}{e.get('failure_count', '?')}{flaky_tag}
") + + # --- Flaky --- + if classified["flaky"]: + parts.append("

Flaky Tests (passed on retry)

") + parts.append("") + for e in classified["flaky"]: + parts.append( + f"" + f"' + f"" + ) + parts.append("
SuiteTestRetries
{_html_escape(e['suite'])}{_html_escape(e['name'])} " + f'FLAKY{e.get('retry_count', '?')}
") + + # --- All failure details --- + all_failures = classified["failed"] + classified["error"] + if all_failures: + parts.append("

All Failure Details

") + for e in all_failures: + msg = _html_escape(e.get("message", "").strip()) + parts.append( + f'

' + f"{_html_escape(e['classname'])}::{_html_escape(e['name'])}

" + f'

' + f"Suite: {_html_escape(e['suite'])}  |  " + f"Source: {_html_escape(e['source_file'])}

" + ) + if msg: + parts.append(f'
{msg}
') + parts.append("
") + + if not all_failures and not classified["flaky"] and not resolved_tests: + parts.append( + '

All tests passed! No failures or flaky tests detected.

' + ) + + parts.append("") + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Generate nightly test failure report from JUnit XML results" + ) + parser.add_argument( + "--results-dir", + required=True, + help="Directory containing JUnit XML test result files", + ) + parser.add_argument( + "--output-dir", + default="report-output", + help="Directory to write report files to", + ) + parser.add_argument( + "--sha", + default=os.environ.get("GITHUB_SHA", "unknown"), + help="Git commit SHA for this run", + ) + parser.add_argument( + "--date", + default=datetime.now(timezone.utc).strftime("%Y-%m-%d"), + help="Date for this run (YYYY-MM-DD)", + ) + parser.add_argument( + "--test-type", + default="unknown", + help=( + "Test type identifier (e.g., cpp, python, wheel-python, " + "wheel-server, notebooks)" + ), + ) + parser.add_argument( + "--matrix-label", + default="", + help=( + "Matrix combination label (e.g., cuda12.9-py3.12-x86_64). " + "Included in reports and JSON summary to identify the CI job." + ), + ) + parser.add_argument( + "--s3-history-uri", + default="", + help=( + "S3 URI for persistent failure history JSON. " + "Downloaded before analysis, uploaded after update. " + "Example: s3://bucket/ci_test_reports/nightly/history/" + "python-main-cuda12.9-py3.12-x86_64.json" + ), + ) + parser.add_argument( + "--s3-summary-uri", + default="", + help=( + "S3 URI to upload this run's JSON snapshot for aggregation. " + "Example: s3://bucket/ci_test_reports/nightly/summaries/" + "2026-04-13/python-cuda12.9-py3.12-x86_64.json" + ), + ) + parser.add_argument( + "--s3-html-uri", + default="", + help=( + "S3 URI to upload the HTML report. 
" + "Example: s3://bucket/ci_test_reports/nightly/reports/" + "2026-04-13/python-cuda12.9-py3.12-x86_64.html" + ), + ) + parser.add_argument( + "--github-step-summary", + default=os.environ.get("GITHUB_STEP_SUMMARY", ""), + help="Path to write GitHub Actions step summary", + ) + + args = parser.parse_args() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + local_history_path = str(output_dir / "test_failure_history.json") + + # ---- Step 1: Download history from S3 ---- + if args.s3_history_uri: + s3_download(args.s3_history_uri, local_history_path) + + # ---- Step 2: Collect and classify results ---- + print(f"Collecting test results from {args.results_dir} ...") + results = collect_all_results(args.results_dir) + if not results: + print("WARNING: No test results found.", file=sys.stderr) + + print(f"Found {len(results)} test case entries across all XML files") + classified = classify_failures(results) + + print( + f"Classification: {len(classified['passed'])} passed, " + f"{len(classified['failed'])} failed, " + f"{len(classified['error'])} errors, " + f"{len(classified['flaky'])} flaky, " + f"{len(classified['skipped'])} skipped" + ) + + # ---- Step 3: Update history ---- + history = load_history(local_history_path) + history, new_failures, recurring_failures, resolved_tests = update_history( + history, classified, args.sha, args.date + ) + + if resolved_tests: + print( + f"Stabilized: {len(resolved_tests)} previously-failing test(s) now pass" + ) + + save_history(history, local_history_path) + print(f"Updated local history at {local_history_path}") + + # ---- Step 4: Upload history back to S3 ---- + if args.s3_history_uri: + s3_upload(local_history_path, args.s3_history_uri) + + # ---- Step 5: Generate reports ---- + report_kwargs = dict( + test_type=args.test_type, + matrix_label=args.matrix_label, + sha=args.sha, + date_str=args.date, + ) + + md_report = generate_markdown_report( + classified, + new_failures, + 
recurring_failures, + resolved_tests, + history, + **report_kwargs, + ) + md_path = output_dir / "nightly_report.md" + md_path.write_text(md_report) + print(f"Markdown report written to {md_path}") + + html_report = generate_html_report( + classified, + new_failures, + recurring_failures, + resolved_tests, + history, + **report_kwargs, + ) + html_path = output_dir / "nightly_report.html" + html_path.write_text(html_report) + print(f"HTML report written to {html_path}") + + json_summary = generate_json_summary( + classified, + new_failures, + recurring_failures, + resolved_tests, + **report_kwargs, + ) + json_path = output_dir / "nightly_summary.json" + json_path.write_text(json.dumps(json_summary, indent=2) + "\n") + print(f"JSON summary written to {json_path}") + + if args.github_step_summary: + with open(args.github_step_summary, "a") as f: + f.write(md_report) + print(f"Wrote GitHub Step Summary to {args.github_step_summary}") + + # ---- Step 6: Upload per-run snapshot and HTML to S3 ---- + if args.s3_summary_uri: + s3_upload(str(json_path), args.s3_summary_uri) + + if args.s3_html_uri: + s3_upload(str(html_path), args.s3_html_uri) + + # ---- Exit code ---- + genuine_failures = len(classified["failed"]) + len(classified["error"]) + if genuine_failures > 0: + print( + f"\nFAILED: {genuine_failures} genuine test failure(s) detected." + ) + return 1 + if classified["flaky"]: + print( + f"\nWARNING: All tests passed but {len(classified['flaky'])} flaky test(s) detected." + ) + else: + print("\nAll tests passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh new file mode 100755 index 0000000000..c3b77e6b7a --- /dev/null +++ b/ci/utils/nightly_report_helper.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# Shared helper for generating nightly test reports with matrix-aware S3 paths. +# +# Usage (source from any test script): +# +# # For C++ tests (no Python version in matrix label): +# generate_nightly_report "cpp" +# +# # For Python tests (includes Python version in matrix label): +# generate_nightly_report "python" --with-python-version +# +# # For wheel tests: +# generate_nightly_report "wheel-python" --with-python-version +# +# Prerequisites (set before calling): +# RAPIDS_TESTS_DIR - directory containing JUnit XML test results +# +# Optional environment variables (auto-detected if not set): +# RAPIDS_CUDA_VERSION - CUDA version (e.g., "12.9") +# RAPIDS_PY_VERSION - Python version (e.g., "3.12"), used with --with-python-version +# RAPIDS_BRANCH - branch name (e.g., "main") +# CUOPT_S3_URI - S3 bucket root (e.g., s3://cuopt-datasets/) +# GITHUB_SHA - commit SHA +# GITHUB_STEP_SUMMARY - path for GitHub Actions step summary + +# Resolve the directory where THIS helper lives (ci/utils/) +_HELPER_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + +generate_nightly_report() { + local test_type="${1:?Usage: generate_nightly_report [--with-python-version]}" + local include_py_version=false + + shift + while [ $# -gt 0 ]; do + case "$1" in + --with-python-version) include_py_version=true ;; + *) echo "WARNING: Unknown option: $1" >&2 ;; + esac + shift + done + + # --- Build matrix label --- + local cuda_tag="cuda${RAPIDS_CUDA_VERSION:-unknown}" + local arch_tag + arch_tag="$(arch)" + local matrix_label="${cuda_tag}-${arch_tag}" + + if [ "${include_py_version}" = true ]; then + local py_tag="py${RAPIDS_PY_VERSION:-unknown}" + matrix_label="${cuda_tag}-${py_tag}-${arch_tag}" + fi + + local branch_slug + branch_slug=$(echo "${RAPIDS_BRANCH:-main}" | tr '/' '-') + local run_date + run_date="$(date +%F)" + + # --- Ensure results dir exists --- + RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" + mkdir -p 
"${RAPIDS_TESTS_DIR}" + + local report_output_dir="${RAPIDS_TESTS_DIR}/report" + mkdir -p "${report_output_dir}" + + # --- Build S3 URIs --- + local s3_history_uri="" + local s3_summary_uri="" + local s3_html_uri="" + + if [ -n "${CUOPT_S3_URI:-}" ]; then + local s3_base="${CUOPT_S3_URI}ci_test_reports/nightly" + s3_history_uri="${s3_base}/history/${branch_slug}/${test_type}-${matrix_label}.json" + s3_summary_uri="${s3_base}/summaries/${run_date}/${branch_slug}/${test_type}-${matrix_label}.json" + s3_html_uri="${s3_base}/reports/${run_date}/${branch_slug}/${test_type}-${matrix_label}.html" + fi + + # --- Run nightly report --- + python3 "${_HELPER_DIR}/nightly_report.py" \ + --results-dir "${RAPIDS_TESTS_DIR}" \ + --output-dir "${report_output_dir}" \ + --sha "${GITHUB_SHA:-unknown}" \ + --date "${run_date}" \ + --test-type "${test_type}" \ + --matrix-label "${matrix_label}" \ + --s3-history-uri "${s3_history_uri}" \ + --s3-summary-uri "${s3_summary_uri}" \ + --s3-html-uri "${s3_html_uri}" \ + --github-step-summary "${GITHUB_STEP_SUMMARY:-}" \ + || true +} diff --git a/ci/utils/s3_helpers.py b/ci/utils/s3_helpers.py new file mode 100644 index 0000000000..54e8b96d21 --- /dev/null +++ b/ci/utils/s3_helpers.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Shared S3 helper functions for cuOpt CI scripts. + +Maps CUOPT_AWS_* credentials to standard AWS env vars and provides +download / upload / list wrappers around the aws CLI. +""" + +import os +import subprocess +import sys + + +def s3_env(): + """Build env dict for AWS CLI calls using CUOPT-specific credentials. + + The cuOpt S3 bucket requires explicit CUOPT_AWS_* static credentials. + Role-based credentials from aws-actions/configure-aws-credentials do not + have access. 
We override AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY with + the CUOPT_* values and unset AWS_SESSION_TOKEN to avoid mixing with + role-based session tokens (matching the pattern in datasets/*.sh). + """ + env = os.environ.copy() + if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"): + env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"] + if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"): + env["AWS_SECRET_ACCESS_KEY"] = os.environ[ + "CUOPT_AWS_SECRET_ACCESS_KEY" + ] + # Unset session token to avoid mixing role-based tokens with static keys + env.pop("AWS_SESSION_TOKEN", None) + if os.environ.get("CUOPT_AWS_REGION"): + env["AWS_DEFAULT_REGION"] = os.environ["CUOPT_AWS_REGION"] + elif "AWS_DEFAULT_REGION" not in env: + env["AWS_DEFAULT_REGION"] = "us-east-1" + return env + + +def s3_download(s3_uri, local_path): + """Download a file from S3. Returns True on success, False on any error.""" + env = s3_env() + try: + subprocess.run( + ["aws", "s3", "cp", s3_uri, local_path], + env=env, + check=True, + capture_output=True, + text=True, + ) + print(f"Downloaded {s3_uri}") + return True + except FileNotFoundError: + print( + "WARNING: aws CLI not found, skipping S3 download", file=sys.stderr + ) + return False + except subprocess.CalledProcessError as exc: + print( + f"WARNING: S3 download failed (first run?): {exc.stderr.strip()}", + file=sys.stderr, + ) + return False + + +def s3_upload(local_path, s3_uri): + """Upload a file to S3. 
Returns True on success.""" + env = s3_env() + try: + subprocess.run( + ["aws", "s3", "cp", local_path, s3_uri], + env=env, + check=True, + capture_output=True, + text=True, + ) + print(f"Uploaded {local_path} to {s3_uri}") + return True + except FileNotFoundError: + print( + "WARNING: aws CLI not found, skipping S3 upload", file=sys.stderr + ) + return False + except subprocess.CalledProcessError as exc: + print( + f"WARNING: S3 upload failed: {exc.stderr.strip()}", file=sys.stderr + ) + return False + + +def s3_list(s3_prefix): + """List objects under an S3 prefix (recursive). Returns list of S3 URIs.""" + env = s3_env() + # Extract bucket and prefix from s3_prefix for reconstructing full URIs + # s3_prefix looks like "s3://bucket/path/to/prefix/" + try: + result = subprocess.run( + ["aws", "s3", "ls", "--recursive", s3_prefix], + env=env, + check=True, + capture_output=True, + text=True, + ) + except (FileNotFoundError, subprocess.CalledProcessError) as exc: + print(f"WARNING: S3 ls failed: {exc}", file=sys.stderr) + return [] + + # --recursive output format: "2026-04-16 12:00:00 1234 path/to/file.json" + # We need to reconstruct full S3 URIs from the key paths + # Parse bucket from s3_prefix + if not s3_prefix.startswith("s3://"): + return [] + without_scheme = s3_prefix[5:] # remove "s3://" + bucket = without_scheme.split("/")[0] + base_uri = f"s3://{bucket}/" + + uris = [] + for line in result.stdout.strip().splitlines(): + parts = line.split(None, 3) # date, time, size, key + if len(parts) == 4: + uris.append(f"{base_uri}{parts[3]}") + return uris diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh new file mode 100755 index 0000000000..195a7d5797 --- /dev/null +++ b/ci/utils/send_consolidated_summary.sh @@ -0,0 +1,401 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# Send a consolidated Slack notification for the entire nightly run. +# Reads the aggregated JSON produced by aggregate_nightly.py and sends: +# - Main message: Header + status summary + test totals + failed CI jobs +# - Thread replies: matrix details, failure details, links, HTML report +# +# If SLACK_BOT_TOKEN is available, posts via chat.postMessage (enables +# threading). Falls back to webhook (no threading) otherwise. +# +# Required environment variables: +# SLACK_WEBHOOK_URL - Slack incoming webhook URL (fallback) +# CONSOLIDATED_SUMMARY - Path to consolidated_summary.json +# +# Optional environment variables: +# CONSOLIDATED_HTML - Path to consolidated HTML file to upload +# SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) for threading + file uploads +# SLACK_CHANNEL_ID - Slack channel ID (required with bot token) +# PRESIGNED_REPORT_URL - Presigned URL for consolidated HTML report +# PRESIGNED_DASHBOARD_URL - Presigned URL for dashboard + +set -euo pipefail + +CONSOLIDATED_SUMMARY="${CONSOLIDATED_SUMMARY:?CONSOLIDATED_SUMMARY must point to consolidated_summary.json}" +SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:?SLACK_WEBHOOK_URL is required}" +CONSOLIDATED_HTML="${CONSOLIDATED_HTML:-}" +SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}" +SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}" +PRESIGNED_REPORT_URL="${PRESIGNED_REPORT_URL:-}" +PRESIGNED_DASHBOARD_URL="${PRESIGNED_DASHBOARD_URL:-}" + +if [ ! -f "${CONSOLIDATED_SUMMARY}" ]; then + echo "ERROR: Summary file not found: ${CONSOLIDATED_SUMMARY}" >&2 + exit 1 +fi + +# Generate Slack payloads — one JSON object per line. +# Line 1 = main message, lines 2+ = thread replies. 
+PAYLOADS=$(python3 - "${CONSOLIDATED_SUMMARY}" "${PRESIGNED_REPORT_URL}" "${PRESIGNED_DASHBOARD_URL}" <<'PYEOF' +import json, sys + +summary_path = sys.argv[1] +presigned_report_url = sys.argv[2] if len(sys.argv) > 2 else "" +presigned_dashboard_url = sys.argv[3] if len(sys.argv) > 3 else "" + +with open(summary_path) as f: + d = json.load(f) + +branch = d.get("branch", "main") +date = d.get("date", "unknown") +github_run_url = d.get("github_run_url", "") +jobs = d.get("job_summary", {}) +totals = d.get("test_totals", {}) +grid = d.get("matrix_grid", []) +has_new = d.get("has_new_failures", False) +failed_ci_jobs = d.get("failed_ci_jobs", []) +untracked_failed = d.get("untracked_failed_ci_jobs", []) +workflow_jobs = d.get("workflow_jobs", []) + +total_jobs = jobs.get("total", 0) +failed_jobs = jobs.get("failed", 0) +flaky_jobs = jobs.get("flaky", 0) +passed_jobs = jobs.get("passed", 0) + +total_ci_jobs = len(workflow_jobs) +failed_ci_count = len(failed_ci_jobs) +passed_ci_count = sum(1 for j in workflow_jobs if j["conclusion"] == "success") + +status_icons = { + "passed": ":white_check_mark:", + "failed-new": ":rotating_light:", + "failed-recurring": ":x:", + "flaky": ":warning:", + "no-results": ":grey_question:", +} + +def make_payload(blocks): + return json.dumps({ + "username": "cuOpt Nightly Bot", + "icon_emoji": ":robot_face:", + "blocks": blocks, + }) + + +# ══════════════════════════════════════════════════════════════════════ +# MAIN MESSAGE (line 1) — posted to channel, becomes thread parent +# ══════════════════════════════════════════════════════════════════════ +blocks = [] + +# Identify which workflows have failures (from both CI jobs and matrix grid) +failing_workflows = set() +for j in failed_ci_jobs: + prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] + failing_workflows.add(prefix) +for g in grid: + if g["status"].startswith("failed"): + failing_workflows.add(g["test_type"]) +flaky_workflows = set() +for g in grid: + if 
g["status"] == "flaky": + flaky_workflows.add(g["test_type"]) + +has_failures = len(failing_workflows) > 0 +untracked_count = len(untracked_failed) + +if has_failures and (has_new or untracked_count > 0): + emoji = ":rotating_light:" + text = f"{len(failing_workflows)} workflow(s) with failures" + mention = "" +elif has_failures: + emoji = ":x:" + text = f"Recurring failures in {len(failing_workflows)} workflow(s)" + mention = "" +elif flaky_workflows: + emoji = ":large_yellow_circle:" + text = "All jobs passed but flaky tests detected" + mention = "" +else: + emoji = ":white_check_mark:" + text = f"All {total_jobs} matrix jobs passed" + if total_ci_jobs > 0: + text += f", all {passed_ci_count} CI jobs succeeded" + mention = "" + +stats_parts = [] +if totals.get("failed", 0) > 0: + stats_parts.append(f":x: {totals['failed']} failed") +if totals.get("flaky", 0) > 0: + stats_parts.append(f":warning: {totals['flaky']} flaky") +if not stats_parts: + stats_parts.append(f":white_check_mark: {totals.get('total', 0)} tests passed") +stats = " | ".join(stats_parts) + +blocks.append({ + "type": "header", + "text": { + "type": "plain_text", + "text": f"cuOpt Nightly Tests \u2014 {branch} \u2014 {date}", + "emoji": True, + }, +}) +blocks.append({ + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"{mention}{emoji} *{text}*\n\n{stats}", + }, +}) + +# Per-workflow failure summary using CI job counts from GitHub API +# Build a lookup: workflow prefix -> (failed, total) from workflow_jobs +wf_counts = {} +for j in workflow_jobs: + prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] + wf_counts.setdefault(prefix, {"failed": 0, "total": 0}) + wf_counts[prefix]["total"] += 1 + if j["conclusion"] == "failure": + wf_counts[prefix]["failed"] += 1 + +if failing_workflows: + lines = [] + for wf in sorted(failing_workflows): + counts = wf_counts.get(wf, {}) + f_count = counts.get("failed", 0) + t_count = counts.get("total", 0) + if t_count > 0: + 
lines.append(f":x: *{wf}* — {f_count}/{t_count} failed") + else: + lines.append(f":x: *{wf}* — failed") + blocks.append({"type": "divider"}) + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": "\n".join(lines)}, + }) + +# Links in main message +link_parts = [] +if github_run_url: + link_parts.append(f"<{github_run_url}|:github: GitHub Actions>") +if presigned_report_url: + link_parts.append(f"<{presigned_report_url}|:bar_chart: Full Report>") +if presigned_dashboard_url: + link_parts.append(f"<{presigned_dashboard_url}|:chart_with_upwards_trend: Dashboard>") +if link_parts: + blocks.append({"type": "divider"}) + blocks.append({ + "type": "context", + "elements": [{"type": "mrkdwn", "text": " | ".join(link_parts)}], + }) + +print(make_payload(blocks)) + + +# ══════════════════════════════════════════════════════════════════════ +# THREAD REPLIES (lines 2+) — posted as replies to main message +# ══════════════════════════════════════════════════════════════════════ + +# ── Thread 1: Failing and flaky tests (grouped by workflow) ─────────── +# Build per-workflow test issue lists +new_failures = d.get("new_failures", []) +recurring = d.get("recurring_failures", []) +flaky = d.get("flaky_tests", []) +resolved = d.get("resolved_tests", []) + +# Collect all test issues by test_type (workflow) +issues_by_wf = {} +for f_entry in new_failures: + tt = f_entry.get("test_type", "unknown") + issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []}) + issues_by_wf[tt]["new"].append(f_entry) +for f_entry in recurring: + tt = f_entry.get("test_type", "unknown") + issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []}) + issues_by_wf[tt]["recurring"].append(f_entry) +for f_entry in flaky: + tt = f_entry.get("test_type", "unknown") + issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []}) + issues_by_wf[tt]["flaky"].append(f_entry) +for r in resolved: + tt = 
r.get("test_type", "unknown") + issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []}) + issues_by_wf[tt]["resolved"].append(r) + +if issues_by_wf: + for wf_name, issues in sorted(issues_by_wf.items()): + wf_blocks = [] + wf_text = f"*{wf_name}*\n" + + # New failures + for f_entry in issues["new"][:10]: + msg = f_entry.get("message", "")[:60].replace("\n", " ") + matrix = f_entry.get("matrix_label", "") + wf_text += f":new: `{f_entry['name']}` ({matrix}) — {msg}\n" + + # Recurring failures + for f_entry in issues["recurring"][:10]: + matrix = f_entry.get("matrix_label", "") + first = f_entry.get("first_seen", "?") + wf_text += f":repeat: `{f_entry['name']}` ({matrix}) — since {first}\n" + + # Flaky + for f_entry in issues["flaky"][:10]: + matrix = f_entry.get("matrix_label", "") + wf_text += f":warning: `{f_entry['name']}` ({matrix})\n" + + # Resolved + for r in issues["resolved"][:5]: + matrix = r.get("matrix_label", "") + count = r.get("failure_count", "?") + wf_text += f":white_check_mark: `{r['name']}` ({matrix}) — was failing {count}x\n" + + # Truncation notes + for category, label, limit in [("new", "new failures", 10), ("recurring", "recurring", 10), + ("flaky", "flaky", 10), ("resolved", "resolved", 5)]: + if len(issues[category]) > limit: + wf_text += f"_...+{len(issues[category]) - limit} more {label}_\n" + + # Chunk if needed + while wf_text: + chunk = wf_text[:2900] + wf_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": chunk.rstrip()}, + }) + wf_text = wf_text[2900:] + + print(make_payload(wf_blocks)) + +PYEOF +) + +# ── Send messages ───────────────────────────────────────────────────── +echo "Sending consolidated Slack notification..." 
+ +THREAD_TS="" +FIRST=true + +while IFS= read -r payload; do + if [ "${FIRST}" = true ] && [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then + # Post main message via chat.postMessage to get thread_ts + BOT_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +print(json.dumps(p)) +" "${payload}" "${SLACK_CHANNEL_ID}") + + RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${BOT_PAYLOAD}" \ + "https://slack.com/api/chat.postMessage") + + THREAD_TS=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ts',''))" 2>/dev/null || echo "") + OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "") + + if [ "${OK}" != "True" ]; then + echo "WARNING: chat.postMessage failed: ${RESPONSE}" >&2 + # Fall back to webhook for this and remaining messages + THREAD_TS="" + curl -s -X POST -H 'Content-type: application/json' --data "${payload}" "${SLACK_WEBHOOK_URL}" || true + else + echo "Main message posted (ts=${THREAD_TS})" + fi + FIRST=false + elif [ -n "${THREAD_TS}" ] && [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then + # Post thread reply via chat.postMessage + THREAD_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +p['thread_ts'] = sys.argv[3] +print(json.dumps(p)) +" "${payload}" "${SLACK_CHANNEL_ID}" "${THREAD_TS}") + + RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${THREAD_PAYLOAD}" \ + "https://slack.com/api/chat.postMessage") + + OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "") + if [ "${OK}" != "True" ]; then + echo "WARNING: Thread reply failed: ${RESPONSE}" >&2 + fi + else + # Fallback: webhook (no threading) + 
response=$(curl -s -X POST \ + -H 'Content-type: application/json' \ + --data "${payload}" \ + "${SLACK_WEBHOOK_URL}") + if [ "${response}" != "ok" ]; then + echo "WARNING: Slack webhook returned: ${response}" >&2 + fi + FIRST=false + fi +done <<< "${PAYLOADS}" +echo "Consolidated Slack notification sent." + +# ── Upload HTML report as file in thread ────────────────────────────── +if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ] && [ -n "${CONSOLIDATED_HTML}" ] && [ -f "${CONSOLIDATED_HTML}" ]; then + echo "Uploading HTML report to Slack..." + + REPORT_DATE=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('date','report'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "report") + REPORT_BRANCH=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('branch','main'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "main") + UPLOAD_FILENAME="cuopt-nightly-${REPORT_BRANCH}-${REPORT_DATE}.html" + FILE_SIZE=$(stat --format=%s "${CONSOLIDATED_HTML}") + UPLOAD_TITLE="cuOpt Nightly Report — ${REPORT_BRANCH} — ${REPORT_DATE}" + + # Step 1: Get an upload URL from Slack + URL_RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + --data-urlencode "filename=${UPLOAD_FILENAME}" \ + --data-urlencode "length=${FILE_SIZE}" \ + "https://slack.com/api/files.getUploadURLExternal") + + UPLOAD_URL=$(echo "${URL_RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('upload_url',''))" 2>/dev/null) + FILE_ID=$(echo "${URL_RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('file_id',''))" 2>/dev/null) + + if [ -z "${UPLOAD_URL}" ] || [ -z "${FILE_ID}" ]; then + echo "WARNING: Slack file upload failed at getUploadURLExternal. 
Response: ${URL_RESPONSE}" >&2 + else + # Step 2: Upload the file content to the presigned URL + curl -s -X POST \ + -F "file=@${CONSOLIDATED_HTML}" \ + "${UPLOAD_URL}" + + # Step 3: Complete the upload and share to channel (in thread if available) + COMPLETE_PAYLOAD=$(python3 -c " +import json, sys +payload = { + 'files': [{'id': sys.argv[1], 'title': sys.argv[2]}], + 'channel_id': sys.argv[3], + 'initial_comment': 'Full nightly test report \u2014 download and open in a browser for interactive details.', +} +thread_ts = sys.argv[4] if len(sys.argv) > 4 and sys.argv[4] else '' +if thread_ts: + payload['thread_ts'] = thread_ts +print(json.dumps(payload)) +" "${FILE_ID}" "${UPLOAD_TITLE}" "${SLACK_CHANNEL_ID}" "${THREAD_TS}") + + COMPLETE_RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${COMPLETE_PAYLOAD}" \ + "https://slack.com/api/files.completeUploadExternal") + + if echo "${COMPLETE_RESPONSE}" | python3 -c "import json,sys; sys.exit(0 if json.load(sys.stdin).get('ok') else 1)" 2>/dev/null; then + echo "HTML report uploaded to Slack." + else + echo "WARNING: Slack file upload failed at completeUploadExternal. Response: ${COMPLETE_RESPONSE}" >&2 + fi + fi +else + if [ -n "${SLACK_BOT_TOKEN}" ] && [ -z "${SLACK_CHANNEL_ID}" ]; then + echo "WARNING: SLACK_BOT_TOKEN set but SLACK_CHANNEL_ID missing, skipping file upload." >&2 + fi +fi diff --git a/ci/utils/send_nightly_summary.sh b/ci/utils/send_nightly_summary.sh new file mode 100755 index 0000000000..7b39a02cec --- /dev/null +++ b/ci/utils/send_nightly_summary.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Enhanced Slack notification for nightly test results. 
+# Reads the JSON summary produced by nightly_report.py and sends a rich
+# Slack message with:
+#   - Failure tables with :new: / :repeat: badges
+#   - @channel on new genuine failures
+#   - Stabilized tests (were failing, now passing)
+#   - Flaky test list
+#
+# Required environment variables:
+#   SLACK_WEBHOOK_URL  - Slack incoming webhook URL (set from CUOPT_SLACK_WEBHOOK_URL in CI)
+#   NIGHTLY_SUMMARY    - Path to nightly_summary.json from nightly_report.py
+#
+# Optional environment variables:
+#   GITHUB_RUN_URL     - Link to the GitHub Actions run
+#   REPORT_URL         - Link to the S3 HTML report
+#   CUOPT_BRANCH       - Branch name (e.g. main)
+
+set -euo pipefail
+
+NIGHTLY_SUMMARY="${NIGHTLY_SUMMARY:?NIGHTLY_SUMMARY must point to nightly_summary.json}"
+SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:?SLACK_WEBHOOK_URL is required}"
+GITHUB_RUN_URL="${GITHUB_RUN_URL:-}"
+REPORT_URL="${REPORT_URL:-}"
+CUOPT_BRANCH="${CUOPT_BRANCH:-main}"
+
+if [ ! -f "${NIGHTLY_SUMMARY}" ]; then
+    echo "ERROR: Summary file not found: ${NIGHTLY_SUMMARY}" >&2
+    exit 1
+fi
+
+# Build the entire Slack payload in Python for safe JSON handling.
+# Shell variable interpolation into nested JSON is brittle; Python reads the
+# summary file directly and produces a valid JSON payload on stdout.
+PAYLOAD=$(python3 - "${NIGHTLY_SUMMARY}" "${CUOPT_BRANCH}" "${GITHUB_RUN_URL}" "${REPORT_URL}" <<'PYEOF'
+import json, sys
+
+summary_path, branch, github_run_url, report_url = sys.argv[1:5]
+
+with open(summary_path) as f:
+    d = json.load(f)
+
+counts = d["counts"]
+total = counts["total"]
+passed = counts["passed"]
+failed = counts["failed"]
+flaky = counts["flaky"]
+skipped = counts["skipped"]
+has_new = d["has_new_failures"]
+
+# Slack rejects the whole message when a section's text exceeds 3000
+# characters, so long lists are split below this limit (with some headroom).
+SECTION_LIMIT = 2900
+
+
+def add_section(blocks, title, lines):
+    """Append `lines` under `title` as one or more mrkdwn section blocks.
+
+    Nothing is appended when `lines` is empty. The text is split on line
+    boundaries so that no single section exceeds SECTION_LIMIT characters.
+    """
+    if not lines:
+        return
+    chunk = title
+    for line in lines:
+        line = line[:SECTION_LIMIT]  # a single oversized entry must not sink the message
+        if len(chunk) + 1 + len(line) > SECTION_LIMIT:
+            blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": chunk}})
+            chunk = line
+        else:
+            chunk = f"{chunk}\n{line}"
+    blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": chunk}})
+
+
+# --- Status line ---
+if failed > 0:
+    if has_new:
+        emoji = ":rotating_light:"
+        text = "NEW test failures detected"
+        # <!channel> is Slack's mrkdwn syntax for an @channel mention; only
+        # brand-new failures page the channel, recurring ones do not.
+        mention = "<!channel> "
+    else:
+        emoji = ":x:"
+        text = "Recurring test failures"
+        mention = ""
+elif flaky > 0:
+    emoji = ":large_yellow_circle:"
+    text = "All passed but flaky tests detected"
+    mention = ""
+else:
+    emoji = ":white_check_mark:"
+    text = "All tests passed"
+    mention = ""
+
+stats = (
+    f":white_check_mark: {passed} passed | :x: {failed} failed | "
+    f":warning: {flaky} flaky | :fast_forward: {skipped} skipped | Total: {total}"
+)
+
+blocks = []
+
+# Header
+blocks.append({
+    "type": "header",
+    "text": {"type": "plain_text", "text": f"cuOpt Nightly Tests \u2014 {branch}", "emoji": True},
+})
+
+# Status summary
+blocks.append({
+    "type": "section",
+    "text": {"type": "mrkdwn", "text": f"{mention}{emoji} *{text}*\n\n{stats}"},
+})
+
+blocks.append({"type": "divider"})
+
+# --- Genuine failures ---
+if failed > 0:
+    lines = []
+    for f_entry in d.get("new_failures", []):
+        msg = f_entry.get("message", "")[:60].replace("\n", " ")
+        lines.append(f" :new: `{f_entry['name']}` ({f_entry['suite']}) \u2014 {msg}")
+    for f_entry in d.get("recurring_failures", []):
+        first = f_entry.get("first_seen", "?")
+        lines.append(f" :repeat: `{f_entry['name']}` ({f_entry['suite']}) \u2014 since {first}")
+    # Skipped entirely when the summary carries no per-test details, so the
+    # message never shows a heading with an empty body.
+    add_section(blocks, "*Genuine Failures:*", lines)
+
+# --- Stabilized tests ---
+lines = []
+for r in d.get("resolved_tests", []):
+    since = r.get("first_seen", "?")
+    count = r.get("failure_count", "?")
+    flaky_tag = " (was flaky)" if r.get("was_flaky") else ""
+    lines.append(
+        f" :white_check_mark: `{r['name']}` ({r['suite']}) \u2014 "
+        f"failing since {since}, failed {count}x{flaky_tag}"
+    )
+add_section(blocks, "*Stabilized (were failing, now pass):*", lines)
+
+# --- Flaky tests ---
+lines = []
+for f_entry in d.get("flaky_tests", []):
+    retries = f_entry.get("retry_count", "?")
+    lines.append(f" :warning: `{f_entry['name']}` ({f_entry['suite']}) \u2014 {retries} retries")
+add_section(blocks, "*Flaky Tests (passed on retry):*", lines)
+
+# --- Links ---
+link_parts = []
+if github_run_url:
+    link_parts.append(f"<{github_run_url}|GitHub Actions>")
+if report_url:
+    link_parts.append(f"<{report_url}|Full Report>")
+if link_parts:
+    blocks.append({"type": "divider"})
+    blocks.append({
+        "type": "context",
+        "elements": [{"type": "mrkdwn", "text": " ".join(link_parts)}],
+    })
+
+payload = {
+    "channel": "cuopt-regression-testing",
+    "username": "cuOpt Nightly Bot",
+    "icon_emoji": ":robot_face:",
+    "blocks": blocks,
+}
+print(json.dumps(payload))
+PYEOF
+)
+
+echo "Sending Slack notification..."
+# A transport-level curl failure still aborts the script through `set -e`.
+RESPONSE=$(curl -s -X POST \
+    -H 'Content-type: application/json' \
+    --data "${PAYLOAD}" \
+    "${SLACK_WEBHOOK_URL}")
+
+# Incoming webhooks reply with the literal body "ok" on success; any other
+# body is an error string (e.g. invalid_blocks) and must not be reported as sent.
+if [ "${RESPONSE}" != "ok" ]; then
+    echo "WARNING: Slack webhook returned: ${RESPONSE}" >&2
+else
+    echo "Slack notification sent."
+fi
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 04dc6bb83c..e8000ffbb3 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -58,6 +58,7 @@ dependencies: - pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 21891cc9f2..43bc8996ad 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -58,6 +58,7 @@ dependencies: - pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 89147b18a7..5a53e13d37 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -58,6 +58,7 @@ dependencies: - pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index 8df6f28bf7..2efc26c0cb 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -58,6 +58,7 @@ dependencies: - pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh index 528455e133..472813a003 100755 --- a/datasets/get_test_data.sh +++ b/datasets/get_test_data.sh @@ -8,7 +8,7 @@ set -o pipefail ################################################################################ # S3 Dataset Download 
Support ################################################################################ -# Set CUOPT_DATASET_S3_URI to base S3 path +# Set CUOPT_S3_URI to S3 bucket root (e.g., s3://cuopt-datasets/) # AWS credentials should be configured via: # - Environment variables (CUOPT_AWS_ACCESS_KEY_ID, CUOPT_AWS_SECRET_ACCESS_KEY) # - Standard AWS variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) @@ -18,8 +18,8 @@ set -o pipefail function try_download_from_s3() { local s3_dirs=("$@") # Array of directories to sync from S3 - if [ -z "${CUOPT_DATASET_S3_URI:-}" ]; then - echo "CUOPT_DATASET_S3_URI not set, skipping S3 download..." + if [ -z "${CUOPT_S3_URI:-}" ]; then + echo "CUOPT_S3_URI not set, skipping S3 download..." return 1 fi @@ -35,7 +35,7 @@ function try_download_from_s3() { fi # Append routing subdirectory to base S3 URI - local s3_uri="${CUOPT_DATASET_S3_URI}routing/" + local s3_uri="${CUOPT_S3_URI}ci_datasets/routing/" echo "Downloading datasets from S3..." # Use CUOPT-specific credentials only diff --git a/dependencies.yaml b/dependencies.yaml index 057fc2a318..18d479a99f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -340,6 +340,7 @@ dependencies: packages: - pytest<9.0 - pytest-cov + - pytest-rerunfailures test_python_cuopt: common: - output_types: [conda] diff --git a/python/cuopt/cuopt/linear_programming/pyproject.toml b/python/cuopt/cuopt/linear_programming/pyproject.toml index 934b12f547..6e2c59c43c 100644 --- a/python/cuopt/cuopt/linear_programming/pyproject.toml +++ b/python/cuopt/cuopt/linear_programming/pyproject.toml @@ -37,6 +37,7 @@ Source = "https://github.com/nvidia/cuopt" [project.optional-dependencies] test = [ "pytest-cov", + "pytest-rerunfailures", "pytest<9.0", "rapids-logger==0.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml index eff7e01769..18b6e75276 100644 --- a/python/cuopt/pyproject.toml +++ b/python/cuopt/pyproject.toml @@ -47,6 +47,7 @@ classifiers = [ test = [ "numpy>=1.23.5,<3.0", "pytest-cov", + "pytest-rerunfailures", "pytest<9.0", "rapids-logger==0.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cuopt_self_hosted/pyproject.toml b/python/cuopt_self_hosted/pyproject.toml index 43aa80a5b3..f4a3b75a60 100644 --- a/python/cuopt_self_hosted/pyproject.toml +++ b/python/cuopt_self_hosted/pyproject.toml @@ -37,6 +37,7 @@ classifiers = [ [project.optional-dependencies] test = [ "pytest-cov", + "pytest-rerunfailures", "pytest<9.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml index ce96c884be..4f9f141011 100644 --- a/python/cuopt_server/pyproject.toml +++ b/python/cuopt_server/pyproject.toml @@ -48,6 +48,7 @@ test = [ "msgpack==1.1.2", "pexpect", "pytest-cov", + "pytest-rerunfailures", "pytest<9.0", "requests", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/skills/cuopt-developer/SKILL.md b/skills/cuopt-developer/SKILL.md index 99743f9171..66d41c003e 100644 --- a/skills/cuopt-developer/SKILL.md +++ b/skills/cuopt-developer/SKILL.md @@ -295,6 +295,15 @@ rmm::device_uvector data(100, stream); | Missing `nvcc` | Set `$CUDACXX` or add CUDA to `$PATH` | | CUDA out of memory | Reduce problem size | | Slow debug library loading | Device symbols cause delay | +| CI state doesn't persist between runs | CI containers are ephemeral. 
Never write persistent state to repo files from CI — use S3 (`CUOPT_S3_URI`) or artifact stores. Ask: "After this container dies, does tomorrow's run see today's data?" | +| CI state transitions go unreported | When CI tracks state over time (e.g. test failures), every transition (new failure, recurring, stabilized) needs an explicit notification path. Ask: "When state X changes to Y, who learns about it and how?" | +| Designing CI features without lifecycle check | Before shipping any CI feature that tracks state: (1) Where does state live between runs? (2) What writes/reads it? (3) What happens on state transitions? Verify end-to-end, not just the happy-path logic. | +| Change applied to only some targets | Before implementing, audit the full scope of what needs the change. For CI: `ls ci/test*.sh`. For APIs: grep all callers. For patterns: find every instance. Enumerate ALL targets first, implement second. | +| Shared resource ignores CI matrix parallelism | CI matrices run jobs in parallel across CUDA x Python x arch. Any shared resource (S3 paths, files, databases) must be keyed by the full execution context. Ask: "What happens when N parallel jobs access this simultaneously?" | +| Same logic duplicated across files | When the same block (>10 lines) appears in 2+ places — any language, any context — extract a shared helper immediately. Don't duplicate first and refactor later. This applies to shell scripts, Python modules, C/C++ code equally. | +| Feature not extensible for new variants | After implementing, ask: "If someone adds a new variant (test type, matrix entry, endpoint, etc.), what do they change?" If the answer is more than a one-line addition, the design needs a shared helper or auto-discovery. Avoid hardcoded lists of known variants. 
| +| Reports generated without actionable detail | Reports and notifications must include enough context to act without digging: error messages, execution context (matrix, commit), history (new vs recurring), and links or attachments for full details. Provide downloadable artifacts when possible. | + ## Canonical Documentation