diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index a945cde8ec..3ba0edd8c1 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -37,6 +37,10 @@ on:
If 'true', trigger the test workflow after all builds complete.
type: boolean
default: false
+ summary-only:
+ description: "If true, skip all build jobs and run only build-summary"
+ type: boolean
+ default: false
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -44,6 +48,7 @@ concurrency:
jobs:
cpp-build:
+ if: ${{ !inputs.summary-only }}
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main
with:
@@ -53,6 +58,7 @@ jobs:
sha: ${{ inputs.sha }}
script: ci/build_cpp.sh
python-build:
+ if: ${{ !inputs.summary-only }}
needs: [cpp-build]
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main
@@ -63,6 +69,7 @@ jobs:
sha: ${{ inputs.sha }}
script: ci/build_python.sh
upload-conda:
+ if: ${{ !inputs.summary-only }}
needs: [cpp-build, python-build]
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main
@@ -72,6 +79,7 @@ jobs:
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
wheel-build-cuopt-mps-parser:
+ if: ${{ !inputs.summary-only }}
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
with:
@@ -86,6 +94,7 @@ jobs:
# need 1 build per Python version and arch (but CUDA version doesn't matter so choose the latest)
matrix_filter: 'group_by([.ARCH, (.PY_VER |split(".") | map(tonumber))])|map(max_by([(.CUDA_VER|split(".")|map(tonumber))]))'
wheel-publish-cuopt-mps-parser:
+ if: ${{ !inputs.summary-only }}
needs: wheel-build-cuopt-mps-parser
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
@@ -97,6 +106,7 @@ jobs:
package-name: cuopt_mps_parser
package-type: python
wheel-build-libcuopt:
+ if: ${{ !inputs.summary-only }}
needs: wheel-build-cuopt-mps-parser
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
@@ -110,6 +120,7 @@ jobs:
package-type: cpp
matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
wheel-publish-libcuopt:
+ if: ${{ !inputs.summary-only }}
needs: wheel-build-libcuopt
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
@@ -121,6 +132,7 @@ jobs:
package-name: libcuopt
package-type: cpp
wheel-build-cuopt:
+ if: ${{ !inputs.summary-only }}
needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt]
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
@@ -133,6 +145,7 @@ jobs:
package-name: cuopt
package-type: python
wheel-publish-cuopt:
+ if: ${{ !inputs.summary-only }}
needs: wheel-build-cuopt
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
@@ -144,6 +157,7 @@ jobs:
package-name: cuopt
package-type: python
wheel-build-cuopt-server:
+ if: ${{ !inputs.summary-only }}
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
with:
@@ -158,6 +172,7 @@ jobs:
# Only need 1 package per CUDA major version. This selects "ARCH=amd64 + the latest supported Python, 1 job per major CUDA version".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
wheel-publish-cuopt-server:
+ if: ${{ !inputs.summary-only }}
needs: wheel-build-cuopt-server
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
@@ -169,6 +184,7 @@ jobs:
package-name: cuopt_server
package-type: python
docs-build:
+ if: ${{ !inputs.summary-only }}
needs: [python-build]
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
@@ -184,6 +200,7 @@ jobs:
container_image: "rapidsai/ci-conda:26.06-latest"
script: "ci/build_docs.sh"
wheel-build-cuopt-sh-client:
+ if: ${{ !inputs.summary-only }}
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
with:
@@ -199,6 +216,7 @@ jobs:
# only need 1 build (noarch package): this selects amd64, oldest-supported Python, latest-supported CUDA
matrix_filter: '[map(select(.ARCH == "amd64")) | min_by((.PY_VER | split(".") | map(tonumber)), (.CUDA_VER | split(".") | map(-tonumber)))]'
wheel-publish-cuopt-sh-client:
+ if: ${{ !inputs.summary-only }}
needs: wheel-build-cuopt-sh-client
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
@@ -219,7 +237,7 @@ jobs:
- wheel-publish-cuopt-server
- wheel-publish-cuopt-sh-client
- wheel-publish-libcuopt
- if: inputs.trigger-tests
+ if: ${{ inputs.trigger-tests && !inputs.summary-only }}
runs-on: ubuntu-latest
# ref: https://docs.github.com/en/actions/reference/security/secure-use#use-an-intermediate-environment-variable
env:
@@ -242,7 +260,35 @@ jobs:
-f date="${INPUT_DATE}" \
-f sha="${INPUT_SHA}"
+ build-summary:
+ if: ${{ always() && (inputs.build_type == 'nightly' || inputs.summary-only) }}
+ needs:
+ - tests
+ - build-images
+ - docs-build
+ runs-on: linux-amd64-cpu4
+ container:
+ image: python:3.12-slim
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.sha }}
+ - name: Install dependencies
+ run: apt-get update && apt-get install -y --no-install-recommends curl
+ - name: Send build summary
+ env:
+ GITHUB_TOKEN: ${{ github.token }}
+ GITHUB_RUN_ID: ${{ github.run_id }}
+ GITHUB_REPOSITORY: ${{ github.repository }}
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }}
+ SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }}
+ SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }}
+ RAPIDS_BRANCH: ${{ inputs.branch }}
+ run: bash ci/build_summary.sh
+
build-images:
+ if: ${{ !inputs.summary-only }}
needs:
- wheel-publish-cuopt
- wheel-publish-cuopt-server
diff --git a/.github/workflows/nightly-summary.yaml b/.github/workflows/nightly-summary.yaml
new file mode 100644
index 0000000000..1bc3369c41
--- /dev/null
+++ b/.github/workflows/nightly-summary.yaml
@@ -0,0 +1,83 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+name: nightly-summary
+
+on:
+ workflow_dispatch:
+ inputs:
+ branch:
+ description: "Branch name the run targets"
+ required: true
+ type: string
+ default: main
+ sha:
+ description: "Full git commit SHA to check out"
+ required: true
+ type: string
+ build_type:
+ description: "Build type (nightly, pull-request, branch)"
+ required: true
+ type: string
+ default: nightly
+ date:
+ description: "Date (YYYY-MM-DD) for this run. Defaults to today."
+ required: false
+ type: string
+ workflow_call:
+ inputs:
+ branch:
+ required: true
+ type: string
+ sha:
+ required: true
+ type: string
+ build_type:
+ required: true
+ type: string
+ date:
+ required: false
+ type: string
+ secrets:
+ CUOPT_S3_URI:
+ required: true
+ CUOPT_AWS_ACCESS_KEY_ID:
+ required: true
+ CUOPT_AWS_SECRET_ACCESS_KEY:
+ required: true
+ CUOPT_SLACK_WEBHOOK_URL:
+ required: false
+ CUOPT_SLACK_BOT_TOKEN:
+ required: false
+ CUOPT_SLACK_CHANNEL_ID:
+ required: false
+
+jobs:
+ nightly-summary:
+ runs-on: linux-amd64-cpu4
+ container:
+ image: python:3.12-slim
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.sha }}
+ - name: Install dependencies
+ run: |
+ apt-get update && apt-get install -y --no-install-recommends curl
+ pip install awscli
+ - name: Run nightly summary
+ env:
+ CUOPT_S3_URI: ${{ secrets.CUOPT_S3_URI }}
+ CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
+ CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
+ CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }}
+ CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }}
+ CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }}
+ RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
+ RAPIDS_BRANCH: ${{ inputs.branch }}
+ RUN_DATE: ${{ inputs.date }}
+ GITHUB_TOKEN: ${{ github.token }}
+ GITHUB_RUN_ID: ${{ github.run_id }}
+ GITHUB_REPOSITORY: ${{ github.repository }}
+ GITHUB_SERVER_URL: ${{ github.server_url }}
+ run: bash ci/nightly_summary.sh
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a652c23b9a..be67501892 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -299,8 +299,8 @@ jobs:
script: ci/test_cpp.sh
matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }}
secrets:
- script-env-secret-1-key: CUOPT_DATASET_S3_URI
- script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }}
+ script-env-secret-1-key: CUOPT_S3_URI
+ script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }}
script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID
script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
@@ -323,8 +323,8 @@ jobs:
script: ci/test_python.sh
matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }}
secrets:
- script-env-secret-1-key: CUOPT_DATASET_S3_URI
- script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }}
+ script-env-secret-1-key: CUOPT_S3_URI
+ script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }}
script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID
script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
@@ -384,8 +384,8 @@ jobs:
script: ci/test_wheel_cuopt.sh
matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }}
secrets:
- script-env-secret-1-key: CUOPT_DATASET_S3_URI
- script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }}
+ script-env-secret-1-key: CUOPT_S3_URI
+ script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }}
script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID
script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
@@ -424,8 +424,8 @@ jobs:
script: ci/test_wheel_cuopt_server.sh
matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }}
secrets:
- script-env-secret-1-key: CUOPT_DATASET_S3_URI
- script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }}
+ script-env-secret-1-key: CUOPT_S3_URI
+ script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }}
script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID
script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a8cc5f2943..5246ed0124 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -24,9 +24,14 @@ on:
description: "build_type: one of [branch, nightly, pull-request]"
type: string
default: nightly
+ summary-only:
+ description: "If true, skip all test jobs and run only nightly-summary"
+ type: boolean
+ default: false
jobs:
conda-cpp-tests:
+ if: ${{ !inputs.summary-only }}
uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main
with:
build_type: ${{ inputs.build_type }}
@@ -35,13 +40,15 @@ jobs:
sha: ${{ inputs.sha }}
script: ci/test_cpp.sh
secrets:
- script-env-secret-1-key: CUOPT_DATASET_S3_URI
- script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }}
+ script-env-secret-1-key: CUOPT_S3_URI
+ script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }}
script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID
script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
+
conda-python-tests:
+ if: ${{ !inputs.summary-only }}
uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main
with:
run_codecov: false
@@ -51,13 +58,15 @@ jobs:
sha: ${{ inputs.sha }}
script: ci/test_python.sh
secrets:
- script-env-secret-1-key: CUOPT_DATASET_S3_URI
- script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }}
+ script-env-secret-1-key: CUOPT_S3_URI
+ script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }}
script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID
script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
+
wheel-tests-cuopt:
+ if: ${{ !inputs.summary-only }}
uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
with:
build_type: ${{ inputs.build_type }}
@@ -66,13 +75,15 @@ jobs:
sha: ${{ inputs.sha }}
script: ci/test_wheel_cuopt.sh
secrets:
- script-env-secret-1-key: CUOPT_DATASET_S3_URI
- script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }}
+ script-env-secret-1-key: CUOPT_S3_URI
+ script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }}
script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID
script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
+
wheel-tests-cuopt-server:
+ if: ${{ !inputs.summary-only }}
uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
with:
build_type: ${{ inputs.build_type }}
@@ -81,13 +92,15 @@ jobs:
sha: ${{ inputs.sha }}
script: ci/test_wheel_cuopt_server.sh
secrets:
- script-env-secret-1-key: CUOPT_DATASET_S3_URI
- script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }}
+ script-env-secret-1-key: CUOPT_S3_URI
+ script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }}
script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID
script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
+
conda-notebook-tests:
+ if: ${{ !inputs.summary-only }}
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
with:
@@ -99,3 +112,24 @@ jobs:
arch: "amd64"
container_image: "rapidsai/ci-conda:26.06-latest"
script: ci/test_notebooks.sh
+ nightly-summary:
+ if: ${{ always() && (inputs.build_type == 'nightly' || inputs.summary-only) }}
+ needs:
+ - conda-cpp-tests
+ - conda-python-tests
+ - wheel-tests-cuopt
+ - wheel-tests-cuopt-server
+ - conda-notebook-tests
+ uses: ./.github/workflows/nightly-summary.yaml
+ with:
+ branch: ${{ inputs.branch }}
+ sha: ${{ inputs.sha }}
+ build_type: ${{ inputs.build_type }}
+ date: ${{ inputs.date }}
+ secrets:
+ CUOPT_S3_URI: ${{ secrets.CUOPT_S3_URI }}
+ CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }}
+ CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
+ CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }}
+ CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }}
+ CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }}
diff --git a/ci/build_summary.sh b/ci/build_summary.sh
new file mode 100755
index 0000000000..e8fd81a436
--- /dev/null
+++ b/ci/build_summary.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Send a Slack notification summarizing the build workflow status.
+# Queries the GitHub API for job statuses and posts a compact message.
+
+set -euo pipefail
+
+BRANCH="${RAPIDS_BRANCH:-main}"
+RUN_DATE="$(date +%F)"
+GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}"
+SLACK_WEBHOOK_URL="${CUOPT_SLACK_WEBHOOK_URL:-}"
+SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}"
+SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}"
+
+if [ -z "${SLACK_WEBHOOK_URL}" ] && [ -z "${SLACK_BOT_TOKEN}" ]; then
+ echo "No Slack credentials set, skipping build summary."
+ exit 0
+fi
+
+# Fetch workflow job statuses
+JOBS_FILE=$(mktemp)
+if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then
+ echo "Fetching build job statuses from GitHub API..."
+ curl -s -L \
+ -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+ -H "Accept: application/vnd.github+json" \
+ "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \
+ > "${JOBS_FILE}"
+else
+ echo "{}" > "${JOBS_FILE}"
+fi
+
+# Generate Slack payload
+PAYLOAD=$(python3 -c "
+import json, sys
+
+with open(sys.argv[1]) as f:
+ data = json.load(f)
+branch = sys.argv[2]
+date = sys.argv[3]
+run_url = sys.argv[4]
+
+jobs = data.get('jobs', [])
+
+# Filter out build-summary itself and compute-matrix helpers
+jobs = [j for j in jobs
+ if 'build-summary' not in j.get('name', '').lower()
+ and 'compute-matrix' not in j.get('name', '').lower()]
+
+# Group by workflow prefix
+groups = {}
+for j in jobs:
+ name = j.get('name', '')
+ prefix = name.split(' / ')[0] if ' / ' in name else name
+ groups.setdefault(prefix, []).append(j)
+
+total = len(jobs)
+failed_count = sum(1 for j in jobs if j.get('conclusion') == 'failure')
+passed_count = sum(1 for j in jobs if j.get('conclusion') == 'success')
+
+if failed_count > 0:
+ emoji = ':x:'
+ status = f'{failed_count} build job(s) failed'
+else:
+ emoji = ':white_check_mark:'
+ status = f'All {passed_count} build jobs passed'
+
+blocks = []
+blocks.append({
+ 'type': 'header',
+ 'text': {'type': 'plain_text', 'text': f'cuOpt Build \u2014 {branch} \u2014 {date}', 'emoji': True},
+})
+blocks.append({
+ 'type': 'section',
+ 'text': {'type': 'mrkdwn', 'text': f'{emoji} *{status}*'},
+})
+blocks.append({'type': 'divider'})
+
+# Build status per group
+lines = []
+for group_name, group_jobs in sorted(groups.items()):
+ g_passed = sum(1 for j in group_jobs if j.get('conclusion') == 'success')
+ g_failed = sum(1 for j in group_jobs if j.get('conclusion') == 'failure')
+ g_total = len(group_jobs)
+
+ if g_failed > 0:
+ icon = ':x:'
+ detail = f'{g_failed}/{g_total} failed'
+ elif g_passed == g_total:
+ icon = ':white_check_mark:'
+ detail = f'{g_total} passed'
+ else:
+ icon = ':grey_question:'
+ detail = f'{g_passed}/{g_total} passed'
+ lines.append(f'{icon} *{group_name}* \u2014 {detail}')
+
+text = '\n'.join(lines)
+if len(text) > 2900:
+ text = text[:2900] + '\n_...truncated_'
+blocks.append({
+ 'type': 'section',
+ 'text': {'type': 'mrkdwn', 'text': text},
+})
+
+# Link
+if run_url:
+ blocks.append({'type': 'divider'})
+ blocks.append({
+ 'type': 'context',
+ 'elements': [{'type': 'mrkdwn', 'text': f'<{run_url}|:github: GitHub Actions>'}],
+ })
+
+print(json.dumps({
+ 'username': 'cuOpt Build Bot',
+ 'icon_emoji': ':package:',
+ 'blocks': blocks,
+}))
+" "${JOBS_FILE}" "${BRANCH}" "${RUN_DATE}" "${GITHUB_RUN_URL}")
+
+rm -f "${JOBS_FILE}"
+
+# Send via bot token (preferred) or webhook
+echo "Sending build summary to Slack..."
+if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then
+ BOT_PAYLOAD=$(python3 -c "
+import json, sys
+p = json.loads(sys.argv[1])
+p['channel'] = sys.argv[2]
+print(json.dumps(p))
+" "${PAYLOAD}" "${SLACK_CHANNEL_ID}")
+
+ RESPONSE=$(curl -s -X POST \
+ -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \
+ -H "Content-Type: application/json" \
+ --data "${BOT_PAYLOAD}" \
+ "https://slack.com/api/chat.postMessage")
+
+ OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "")
+ if [ "${OK}" != "True" ]; then
+ echo "WARNING: chat.postMessage failed: ${RESPONSE}" >&2
+ # Fall back to webhook
+ if [ -n "${SLACK_WEBHOOK_URL}" ]; then curl -s -X POST -H 'Content-type: application/json' --data "${PAYLOAD}" "${SLACK_WEBHOOK_URL}" || true; fi
+ else
+ echo "Build summary posted to Slack."
+ fi
+elif [ -n "${SLACK_WEBHOOK_URL}" ]; then
+ response=$(curl -s -X POST \
+ -H 'Content-type: application/json' \
+ --data "${PAYLOAD}" \
+ "${SLACK_WEBHOOK_URL}")
+ if [ "${response}" != "ok" ]; then
+ echo "WARNING: Slack webhook returned: ${response}" >&2
+ else
+ echo "Build summary posted to Slack."
+ fi
+fi
diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html
new file mode 100644
index 0000000000..73329dea0c
--- /dev/null
+++ b/ci/dashboard/index.html
@@ -0,0 +1,689 @@
+
+
+
+
+
+cuOpt Nightly Test Dashboard
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Loading dashboard data...
+
+
+
+
+
+
+
+
+
+
+
+ Failures
+ Matrix Grid
+ Flaky
+ Stabilized
+ Trends
+
+
+
+
+
+
+
+
diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh
new file mode 100755
index 0000000000..41790d8b44
--- /dev/null
+++ b/ci/nightly_summary.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Aggregate all per-matrix nightly test summaries and send a single
+# consolidated Slack notification. Runs as a post-test job after all
+# matrix CI jobs finish.
+#
+# The script needs S3 access via CUOPT_S3_URI (bucket root) and CUOPT_AWS_* credentials.
+#
+# Optional:
+# CUOPT_SLACK_WEBHOOK_URL - sends Slack if set
+# RAPIDS_BRANCH - branch name (default: main)
+# RAPIDS_BUILD_TYPE - build type (nightly, pull-request, etc.)
+# GITHUB_TOKEN - for querying workflow job statuses
+# GITHUB_RUN_ID - current workflow run ID
+
+set -euo pipefail
+
+SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+OUTPUT_DIR="${PWD}/aggregate-output"
+mkdir -p "${OUTPUT_DIR}"
+
+RUN_DATE="${RUN_DATE:-$(date +%F)}"
+BRANCH="${RAPIDS_BRANCH:-main}"
+
+GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}"
+
+# Map CUOPT_AWS_* to standard AWS env vars for the aws CLI
+export AWS_ACCESS_KEY_ID="${CUOPT_AWS_ACCESS_KEY_ID:-${AWS_ACCESS_KEY_ID:-}}"
+export AWS_SECRET_ACCESS_KEY="${CUOPT_AWS_SECRET_ACCESS_KEY:-${AWS_SECRET_ACCESS_KEY:-}}"
+unset AWS_SESSION_TOKEN
+
+if [ -z "${CUOPT_S3_URI:-}" ]; then
+ echo "WARNING: CUOPT_S3_URI is not set. Skipping nightly aggregation." >&2
+ exit 0
+fi
+
+S3_BASE="${CUOPT_S3_URI%/}/ci_test_reports/nightly"
+BRANCH_SLUG=$(echo "${BRANCH}" | tr '/' '-')
+# Per-matrix summaries are uploaded by test jobs under summaries/{date}/{branch}/.
+# For production nightlies (main, release/*), RAPIDS_BRANCH matches the branch input.
+# For feature branch testing, RAPIDS_BRANCH may default to "main" in rapidsai containers,
+# so we search the date prefix recursively (s3_list handles this).
+S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/"
+S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/"
+S3_CONSOLIDATED_JSON="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/consolidated.json"
+S3_CONSOLIDATED_HTML="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/consolidated.html"
+S3_INDEX_URI="${S3_BASE}/index.json"
+S3_DASHBOARD_URI="${S3_BASE}/dashboard/${BRANCH_SLUG}/index.html"
+DASHBOARD_DIR="${SCRIPT_DIR}/dashboard"
+
+# --- Query GitHub API for workflow job statuses ---
+WORKFLOW_JOBS_JSON="${OUTPUT_DIR}/workflow_jobs.json"
+if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then
+ echo "Fetching workflow job statuses from GitHub API..."
+ curl -s -L \
+ -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+ -H "Accept: application/vnd.github+json" \
+ "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \
+ > "${WORKFLOW_JOBS_JSON}" || echo "{}" > "${WORKFLOW_JOBS_JSON}"
+else
+ echo "WARNING: GITHUB_TOKEN or GITHUB_RUN_ID not set, skipping workflow job status." >&2
+ echo "{}" > "${WORKFLOW_JOBS_JSON}"
+fi
+
+
+# Fallback: search the date-level prefix if branch-specific path is empty.
+# This handles the case where RAPIDS_BRANCH in rapidsai containers differs
+# from the branch input (e.g., feature branch testing where RAPIDS_BRANCH=main).
+S3_SUMMARIES_FALLBACK="${S3_BASE}/summaries/${RUN_DATE}/"
+
+echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}"
+
+python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \
+ --s3-summaries-prefix "${S3_SUMMARIES_PREFIX}" \
+ --s3-summaries-fallback "${S3_SUMMARIES_FALLBACK}" \
+ --s3-reports-prefix "${S3_REPORTS_PREFIX}" \
+ --s3-output-uri "${S3_CONSOLIDATED_JSON}" \
+ --s3-html-output-uri "${S3_CONSOLIDATED_HTML}" \
+ --s3-index-uri "${S3_INDEX_URI}" \
+ --s3-dashboard-uri "${S3_DASHBOARD_URI}" \
+ --dashboard-dir "${DASHBOARD_DIR}" \
+ --output-dir "${OUTPUT_DIR}" \
+ --date "${RUN_DATE}" \
+ --branch "${BRANCH}" \
+ --github-run-url "${GITHUB_RUN_URL}" \
+ --workflow-jobs "${WORKFLOW_JOBS_JSON}"
+
+# --- Generate presigned URLs for reports (7-day expiry) ---
+PRESIGN_EXPIRY=604800
+PRESIGNED_HTML=$(aws s3 presign "${S3_CONSOLIDATED_HTML}" --expires-in "${PRESIGN_EXPIRY}" 2>&1) || {
+ echo "WARNING: Failed to generate presigned URL for report: ${PRESIGNED_HTML}" >&2
+ PRESIGNED_HTML=""
+}
+PRESIGNED_DASHBOARD=$(aws s3 presign "${S3_DASHBOARD_URI}" --expires-in "${PRESIGN_EXPIRY}" 2>&1) || {
+ echo "WARNING: Failed to generate presigned URL for dashboard: ${PRESIGNED_DASHBOARD}" >&2
+ PRESIGNED_DASHBOARD=""
+}
+
+# Send consolidated Slack notification if webhook is available and this is a nightly build
+if [ -n "${CUOPT_SLACK_WEBHOOK_URL:-}" ] && [ "${RAPIDS_BUILD_TYPE:-}" = "nightly" ]; then
+ echo "Sending consolidated Slack notification"
+ CONSOLIDATED_SUMMARY="${OUTPUT_DIR}/consolidated_summary.json" \
+ CONSOLIDATED_HTML="${OUTPUT_DIR}/consolidated_report.html" \
+ SLACK_WEBHOOK_URL="${CUOPT_SLACK_WEBHOOK_URL}" \
+ SLACK_BOT_TOKEN="${CUOPT_SLACK_BOT_TOKEN:-}" \
+ SLACK_CHANNEL_ID="${CUOPT_SLACK_CHANNEL_ID:-}" \
+ PRESIGNED_REPORT_URL="${PRESIGNED_HTML}" \
+ PRESIGNED_DASHBOARD_URL="${PRESIGNED_DASHBOARD}" \
+ bash "${SCRIPT_DIR}/utils/send_consolidated_summary.sh"
+fi
+
+echo "Nightly summary complete."
diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh
index fc1de8e1b4..f1d57519b1 100755
--- a/ci/run_ctests.sh
+++ b/ci/run_ctests.sh
@@ -21,16 +21,40 @@ else
exit 1
fi
-for gt in "${GTEST_DIR}"/*_TEST; do
+GTEST_MAX_RETRIES=${GTEST_MAX_RETRIES:-1}
+
+run_gtest_with_retry() {
+ local gt="$1"
+ shift
+ local test_name
test_name=$(basename "${gt}")
+
echo "Running gtest ${test_name}"
- "${gt}" "$@"
+ if "${gt}" "$@"; then
+ return 0
+ fi
+
+ local attempt
+ for attempt in $(seq 1 "${GTEST_MAX_RETRIES}"); do
+ echo "WARNING: ${test_name} failed, retry ${attempt}/${GTEST_MAX_RETRIES}"
+ if "${gt}" "$@"; then
+ echo "FLAKY: ${test_name} passed on retry ${attempt}"
+ return 0
+ fi
+ done
+
+ echo "FAILED: ${test_name} failed after $((GTEST_MAX_RETRIES + 1)) attempts"
+ return 1
+}
+
+for gt in "${GTEST_DIR}"/*_TEST; do
+ run_gtest_with_retry "${gt}" "$@"
done
# Run C_API_TEST with CPU memory for local solves (excluding time limit tests)
if [ -x "${GTEST_DIR}/C_API_TEST" ]; then
echo "Running gtest C_API_TEST with CUOPT_USE_CPU_MEM_FOR_LOCAL"
- CUOPT_USE_CPU_MEM_FOR_LOCAL=1 "${GTEST_DIR}/C_API_TEST" --gtest_filter=-c_api/TimeLimitTestFixture.* "$@"
+ CUOPT_USE_CPU_MEM_FOR_LOCAL=1 run_gtest_with_retry "${GTEST_DIR}/C_API_TEST" --gtest_filter=-c_api/TimeLimitTestFixture.* "$@"
else
echo "Skipping C_API_TEST with CUOPT_USE_CPU_MEM_FOR_LOCAL (binary not found)"
fi
diff --git a/ci/run_cuopt_pytests.sh b/ci/run_cuopt_pytests.sh
index 66e996715a..080fa42a1b 100755
--- a/ci/run_cuopt_pytests.sh
+++ b/ci/run_cuopt_pytests.sh
@@ -9,4 +9,4 @@ set -euo pipefail
# Support invoking run_cuopt_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuopt/cuopt/
-pytest -s --cache-clear "$@" tests
+pytest -s --cache-clear --reruns 2 --reruns-delay 5 "$@" tests
diff --git a/ci/run_cuopt_server_pytests.sh b/ci/run_cuopt_server_pytests.sh
index 4cb361a473..75d87d255d 100755
--- a/ci/run_cuopt_server_pytests.sh
+++ b/ci/run_cuopt_server_pytests.sh
@@ -9,4 +9,4 @@ set -euo pipefail
# Support invoking run_cuopt_server_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuopt_server/cuopt_server/
-pytest -s --cache-clear "$@" tests
+pytest -s --cache-clear --reruns 2 --reruns-delay 5 "$@" tests
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index 653c44133a..a68e0c7979 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -euo pipefail
@@ -54,5 +54,9 @@ export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/
rapids-logger "Run gtests"
timeout 40m ./ci/run_ctests.sh
+rapids-logger "Generate nightly test report"
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "cpp"
+
rapids-logger "Test script exiting with value: $EXITCODE"
exit ${EXITCODE}
diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh
index 22c41af84c..0b2b339ba1 100755
--- a/ci/test_notebooks.sh
+++ b/ci/test_notebooks.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -euo pipefail
@@ -64,5 +64,11 @@ for nb in ${NBLIST}; do
fi
done
+popd
+
+rapids-logger "Generate nightly test report"
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "notebooks" --with-python-version
+
rapids-logger "Notebook test script exiting with value: $EXITCODE"
exit ${EXITCODE}
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 4f91c83334..9af612ad76 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -77,5 +77,9 @@ timeout 20m ./ci/run_cuopt_server_pytests.sh \
rapids-logger "Test skills/ assets (Python, C, CLI)"
timeout 10m ./ci/test_skills_assets.sh
+rapids-logger "Generate nightly test report"
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "python" --with-python-version
+
rapids-logger "Test script exiting with value: $EXITCODE"
exit ${EXITCODE}
diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh
index a327082e83..878db67594 100755
--- a/ci/test_wheel_cuopt.sh
+++ b/ci/test_wheel_cuopt.sh
@@ -63,6 +63,14 @@ cd -
RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
export RAPIDS_DATASET_ROOT_DIR
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
+export RAPIDS_TESTS_DIR
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
+EXITCODE=0
+trap "EXITCODE=1" ERR
+set +e
+
# Run CLI tests
timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh
@@ -71,7 +79,9 @@ timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh
# Due to race condition in certain cases UCX might not be able to cleanup properly, so we set the number of threads to 1
export OMP_NUM_THREADS=1
-timeout 30m ./ci/run_cuopt_pytests.sh --verbose --capture=no
+timeout 30m ./ci/run_cuopt_pytests.sh \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-wheel-cuopt.xml" \
+ --verbose --capture=no
# run thirdparty integration tests for only nightly builds
if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then
@@ -80,3 +90,9 @@ if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then
./ci/thirdparty-testing/run_pulp_tests.sh
./ci/thirdparty-testing/run_pyomo_tests.sh
fi
+
+# Generate nightly test report
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "wheel-python" --with-python-version
+
+exit ${EXITCODE}
diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh
index a76969b965..55852a913c 100755
--- a/ci/test_wheel_cuopt_server.sh
+++ b/ci/test_wheel_cuopt_server.sh
@@ -39,7 +39,22 @@ rapids-pip-retry install \
RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
export RAPIDS_DATASET_ROOT_DIR
-timeout 30m ./ci/run_cuopt_server_pytests.sh --verbose --capture=no
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
+EXITCODE=0
+trap "EXITCODE=1" ERR
+set +e
+
+timeout 30m ./ci/run_cuopt_server_pytests.sh \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-wheel-cuopt-server.xml" \
+ --verbose --capture=no
# Run documentation tests
./ci/test_doc_examples.sh
+
+# Generate nightly test report
+source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh"
+generate_nightly_report "wheel-server" --with-python-version
+
+exit ${EXITCODE}
diff --git a/ci/thirdparty-testing/run_cvxpy_tests.sh b/ci/thirdparty-testing/run_cvxpy_tests.sh
index c336f6a800..4b874fc4f0 100755
--- a/ci/thirdparty-testing/run_cvxpy_tests.sh
+++ b/ci/thirdparty-testing/run_cvxpy_tests.sh
@@ -32,10 +32,14 @@ python -m pip install \
# ensure that environment is still consistent (i.e. cvxpy requirements do not conflict with cuopt's)
pip check
+RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}"
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
echo "running 'cvxpy' tests"
timeout 3m python -m pytest \
--verbose \
--capture=no \
--error-for-skips \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-cvxpy.xml" \
-k "TestCUOPT" \
./cvxpy/tests/test_conic_solvers.py
diff --git a/ci/thirdparty-testing/run_pulp_tests.sh b/ci/thirdparty-testing/run_pulp_tests.sh
index f9cb0ca8a5..2c26db7a23 100755
--- a/ci/thirdparty-testing/run_pulp_tests.sh
+++ b/ci/thirdparty-testing/run_pulp_tests.sh
@@ -23,6 +23,9 @@ python -m pip install \
pip check
+RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}"
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
rapids-logger "running PuLP tests (cuOpt-related)"
# PuLP uses pytest; run only tests that reference cuopt/CUOPT
# Exit code 5 = no tests collected; then try run_tests.py which detects solvers (including cuopt)
@@ -30,6 +33,7 @@ pytest_rc=0
timeout 5m python -m pytest \
--verbose \
--capture=no \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-pulp.xml" \
-k "cuopt or CUOPT" \
pulp/tests/ || pytest_rc=$?
diff --git a/ci/thirdparty-testing/run_pyomo_tests.sh b/ci/thirdparty-testing/run_pyomo_tests.sh
index f50df676c9..d2b0639f6e 100755
--- a/ci/thirdparty-testing/run_pyomo_tests.sh
+++ b/ci/thirdparty-testing/run_pyomo_tests.sh
@@ -23,11 +23,15 @@ python -m pip install \
pip check
+RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}"
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
rapids-logger "running Pyomo tests (cuopt_direct / cuOpt-related)"
# Run only tests that reference cuopt (cuopt_direct solver)
timeout 5m python -m pytest \
--verbose \
--capture=no \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-pyomo.xml" \
-k "cuopt or CUOPT" \
pyomo/solvers/tests/
diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py
new file mode 100644
index 0000000000..04989a4846
--- /dev/null
+++ b/ci/utils/aggregate_nightly.py
@@ -0,0 +1,770 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Aggregate per-matrix nightly test summaries into a single consolidated report.
+
+Runs as a post-test job after all matrix CI jobs finish. It:
+ 1. Lists all JSON summaries uploaded to S3 for today's date
+ 2. Downloads and merges them
+ 3. Builds a matrix grid (test_type x matrix_label → status)
+ 4. Generates a consolidated JSON, HTML report, and Slack payload
+ 5. Uploads the consolidated report to S3
+
+Usage:
+ python ci/utils/aggregate_nightly.py \\
+ --s3-summaries-prefix s3://bucket/ci_test_reports/nightly/summaries/2026-04-13/ \\
+ --s3-reports-prefix s3://bucket/ci_test_reports/nightly/reports/2026-04-13/ \\
+ --output-dir /tmp/aggregate-output \\
+ --date 2026-04-13 \\
+ --branch main
+"""
+
+import argparse
+import json
+import os
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Ensure ci/utils is importable when invoked as a script
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from s3_helpers import s3_download, s3_upload, s3_list # noqa: E402
+
+
+# ---------------------------------------------------------------------------
+# Download and merge summaries
+# ---------------------------------------------------------------------------
+
+
+def download_summaries(s3_prefix, local_dir, s3_fallback_prefix=""):
+ """Download all JSON summaries from S3 prefix into local_dir.
+ If s3_fallback_prefix is set and no summaries found at s3_prefix,
+ retries with the fallback (used when RAPIDS_BRANCH in rapidsai
+ containers doesn't match the branch input).
+ Returns list of loaded summary dicts."""
+ local_dir = Path(local_dir)
+ local_dir.mkdir(parents=True, exist_ok=True)
+
+ uris = s3_list(s3_prefix)
+ json_uris = [
+ u for u in uris
+ if u.endswith(".json") and not u.endswith("/consolidated.json")
+ ]
+
+ # Fallback: search the parent date prefix if branch-specific path is empty
+ if not json_uris and s3_fallback_prefix and s3_fallback_prefix != s3_prefix:
+ print(f"No summaries at {s3_prefix}, trying fallback: {s3_fallback_prefix}")
+ uris = s3_list(s3_fallback_prefix)
+ json_uris = [
+ u for u in uris
+ if u.endswith(".json") and not u.endswith("/consolidated.json")
+ ]
+ if json_uris:
+ s3_prefix = s3_fallback_prefix
+
+ print(f"Found {len(json_uris)} summary file(s) at {s3_prefix}")
+
+ summaries = []
+ for uri in json_uris:
+ filename = uri.rsplit("/", 1)[-1]
+ local_path = str(local_dir / filename)
+ if s3_download(uri, local_path):
+ try:
+ with open(local_path) as f:
+ summaries.append(json.load(f))
+ except (json.JSONDecodeError, OSError) as exc:
+ print(
+ f"WARNING: Failed to parse {local_path}: {exc}",
+ file=sys.stderr,
+ )
+ return summaries
+
+
+def load_local_summaries(local_dir):
+ """Load summaries from a local directory (for testing without S3)."""
+ local_dir = Path(local_dir)
+ summaries = []
+ for json_file in sorted(local_dir.glob("*.json")):
+ try:
+ with open(json_file) as f:
+ summaries.append(json.load(f))
+ except (json.JSONDecodeError, OSError) as exc:
+ print(
+ f"WARNING: Failed to parse {json_file}: {exc}", file=sys.stderr
+ )
+ return summaries
+
+
+# ---------------------------------------------------------------------------
+# Aggregation
+# ---------------------------------------------------------------------------
+
+
+def aggregate_summaries(summaries):
+ """Merge per-matrix summaries into a consolidated view.
+
+ Returns a dict with:
+ - matrix_grid: list of {test_type, matrix_label, status, counts, ...}
+ - totals: aggregate counts
+ - all_new_failures, all_recurring_failures, all_flaky_tests,
+ all_resolved_tests: merged lists with matrix context added
+ """
+ grid = []
+ totals = {
+ "total": 0,
+ "passed": 0,
+ "failed": 0,
+ "flaky": 0,
+ "skipped": 0,
+ "resolved": 0,
+ }
+ all_new_failures = []
+ all_recurring_failures = []
+ all_flaky_tests = []
+ all_resolved_tests = []
+
+ for s in summaries:
+ test_type = s.get("test_type", "unknown")
+ matrix_label = s.get("matrix_label", "unknown")
+ counts = s.get("counts", {})
+
+ # Determine job status
+ failed = counts.get("failed", 0)
+ flaky = counts.get("flaky", 0)
+ has_new = s.get("has_new_failures", False)
+
+ if failed > 0:
+ status = "failed-new" if has_new else "failed-recurring"
+ elif flaky > 0:
+ status = "flaky"
+ elif counts.get("total", 0) == 0:
+ status = "no-results"
+ else:
+ status = "passed"
+
+ grid.append(
+ {
+ "test_type": test_type,
+ "matrix_label": matrix_label,
+ "status": status,
+ "counts": counts,
+ "sha": s.get("sha", ""),
+ }
+ )
+
+ # Accumulate totals
+ for key in totals:
+ totals[key] += counts.get(key, 0)
+
+ # Merge failure lists with matrix context
+ ctx = {"test_type": test_type, "matrix_label": matrix_label}
+ for entry in s.get("new_failures", []):
+ all_new_failures.append({**entry, **ctx})
+ for entry in s.get("recurring_failures", []):
+ all_recurring_failures.append({**entry, **ctx})
+ for entry in s.get("flaky_tests", []):
+ all_flaky_tests.append({**entry, **ctx})
+ for entry in s.get("resolved_tests", []):
+ all_resolved_tests.append({**entry, **ctx})
+
+ # Sort grid for consistent display
+ grid.sort(key=lambda g: (g["test_type"], g["matrix_label"]))
+
+ return {
+ "matrix_grid": grid,
+ "totals": totals,
+ "all_new_failures": all_new_failures,
+ "all_recurring_failures": all_recurring_failures,
+ "all_flaky_tests": all_flaky_tests,
+ "all_resolved_tests": all_resolved_tests,
+ }
+
+
+# ---------------------------------------------------------------------------
+# Consolidated JSON
+# ---------------------------------------------------------------------------
+
+
+def parse_workflow_jobs(workflow_jobs_path):
+ """Parse GitHub Actions workflow job statuses from JSON file.
+ Returns all jobs (except nightly-summary itself) with name,
+ conclusion, URL, and whether they are tracked by per-matrix
+ S3 summaries."""
+ if not workflow_jobs_path or not Path(workflow_jobs_path).exists():
+ return []
+
+ # Job name prefixes that are covered by per-matrix S3 reports.
+ # These jobs also have detailed test results; other jobs only have
+ # a pass/fail status at the workflow level.
+ TRACKED_PREFIXES = (
+ "conda-cpp-tests",
+ "conda-python-tests",
+ "wheel-tests-cuopt-server",
+ "wheel-tests-cuopt",
+ )
+
+ try:
+ with open(workflow_jobs_path) as f:
+ data = json.load(f)
+ jobs_list = data.get("jobs", [])
+ result = []
+ for job in jobs_list:
+ name = job.get("name", "")
+ # Skip the nightly-summary job itself
+ if "nightly-summary" in name.lower():
+ continue
+ # Skip helper jobs (compute-matrix, etc.)
+ if "compute-matrix" in name.lower():
+ continue
+ tracked = any(name.startswith(p) for p in TRACKED_PREFIXES)
+ result.append({
+ "name": name,
+ "conclusion": job.get("conclusion", "unknown"),
+ "status": job.get("status", "unknown"),
+ "url": job.get("html_url", ""),
+ "has_test_details": tracked,
+ })
+ return result
+ except (json.JSONDecodeError, OSError) as exc:
+ print(
+ f"WARNING: Failed to parse workflow jobs: {exc}",
+ file=sys.stderr,
+ )
+ return []
+
+
+def generate_consolidated_json(agg, date_str, branch, github_run_url="",
+ workflow_jobs=None):
+ """Generate the consolidated JSON for Slack and dashboard."""
+ total_jobs = len(agg["matrix_grid"])
+ failed_jobs = sum(
+ 1 for g in agg["matrix_grid"] if g["status"].startswith("failed")
+ )
+ flaky_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "flaky")
+ passed_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "passed")
+
+ # Workflow-level CI job statuses
+ wf_jobs = workflow_jobs or []
+ failed_ci_jobs = [j for j in wf_jobs if j["conclusion"] == "failure"]
+ # Jobs without per-matrix S3 tracking (notebooks, JuMP, etc.)
+ untracked_failed = [
+ j for j in failed_ci_jobs if not j.get("has_test_details", False)
+ ]
+
+ return {
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "date": date_str,
+ "branch": branch,
+ "github_run_url": github_run_url,
+ "job_summary": {
+ "total": total_jobs,
+ "passed": passed_jobs,
+ "failed": failed_jobs,
+ "flaky": flaky_jobs,
+ },
+ "test_totals": agg["totals"],
+ "has_new_failures": len(agg["all_new_failures"]) > 0,
+ "matrix_grid": agg["matrix_grid"],
+ "new_failures": agg["all_new_failures"],
+ "recurring_failures": agg["all_recurring_failures"],
+ "flaky_tests": agg["all_flaky_tests"],
+ "resolved_tests": agg["all_resolved_tests"],
+ "workflow_jobs": wf_jobs,
+ "failed_ci_jobs": failed_ci_jobs,
+ "untracked_failed_ci_jobs": untracked_failed,
+ }
+
+
+# ---------------------------------------------------------------------------
+# Consolidated HTML
+# ---------------------------------------------------------------------------
+
+
+def _html_escape(text):
+    return (
+        str(text)
+        .replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+        .replace('"', "&quot;")
+    )
+
+
+def _status_badge(status):
+ """Return an HTML badge for a matrix cell status."""
+ colors = {
+ "passed": ("#388e3c", "PASS"),
+ "failed-new": ("#d32f2f", "NEW FAIL"),
+ "failed-recurring": ("#e65100", "RECURRING"),
+ "flaky": ("#f9a825", "FLAKY"),
+ "no-results": ("#757575", "NO DATA"),
+ }
+ bg, label = colors.get(status, ("#757575", status.upper()))
+ text_color = "#212121" if status == "flaky" else "#fff"
+    return (
+        f'<span style="background:{bg};color:{text_color};'
+        f'padding:2px 8px;border-radius:4px;font-weight:bold;">'
+        f"{label}</span>"
+    )
+
+
+def generate_consolidated_html(
+ agg,
+ date_str,
+ branch,
+ github_run_url="",
+ s3_reports_prefix="",
+):
+ """Generate a consolidated HTML dashboard for all matrix combos."""
+ total_jobs = len(agg["matrix_grid"])
+ failed_jobs = sum(
+ 1 for g in agg["matrix_grid"] if g["status"].startswith("failed")
+ )
+
+ if failed_jobs > 0:
+ bar_color = "#d32f2f"
+ bar_text = f"{failed_jobs} of {total_jobs} matrix jobs have failures"
+ elif any(g["status"] == "flaky" for g in agg["matrix_grid"]):
+ bar_color = "#f9a825"
+ bar_text = "All jobs passed (flaky tests detected)"
+ else:
+ bar_color = "#388e3c"
+ bar_text = f"All {total_jobs} matrix jobs passed"
+
+ totals = agg["totals"]
+
+ parts = []
+ parts.append(f"""
+
+
+
+
+cuOpt Nightly — {_html_escape(branch)} — {_html_escape(date_str)}
+
+
+
+cuOpt Nightly Tests — {_html_escape(branch)}
+
+{bar_text}
+
+
{totals["total"]}
Total Tests
+
+
+
+
{totals["skipped"]}
Skipped
+
{totals["resolved"]}
Stabilized
+
""")
+
+ # --- New failures ---
+ if agg["all_new_failures"]:
+        parts.append("<h2>New Failures</h2>")
+ parts.append(
+ "Test Type Matrix Suite "
+ "Test Error "
+ )
+ for e in agg["all_new_failures"]:
+ msg = _html_escape(e.get("message", ""))
+ short = _html_escape(e.get("message", "")[:100])
+ parts.append(
+ f"{_html_escape(e['test_type'])} "
+ f"{_html_escape(e['matrix_label'])} "
+ f"{_html_escape(e['suite'])} "
+ f"{_html_escape(e['name'])} "
+ f"{short} "
+ f'{msg} '
+ )
+        parts.append("</table>")
+
+ # --- Recurring failures ---
+ if agg["all_recurring_failures"]:
+        parts.append("<h2>Recurring Failures</h2>")
+ parts.append(
+ "Test Type Matrix Suite "
+ "Test Since Error "
+ )
+ for e in agg["all_recurring_failures"]:
+ msg = _html_escape(e.get("message", ""))
+ short = _html_escape(e.get("message", "")[:100])
+ parts.append(
+ f"{_html_escape(e['test_type'])} "
+ f"{_html_escape(e['matrix_label'])} "
+ f"{_html_escape(e['suite'])} "
+ f"{_html_escape(e['name'])} "
+ f"{_html_escape(e.get('first_seen', '?'))} "
+ f"{short} "
+ f'{msg} '
+ )
+        parts.append("</table>")
+
+ # --- Resolved ---
+ if agg["all_resolved_tests"]:
+        parts.append("<h2>Stabilized Tests</h2>")
+ parts.append(
+ "Test Type Matrix Suite "
+ "Test Failing Since Count "
+ )
+ for e in agg["all_resolved_tests"]:
+ parts.append(
+ f"{_html_escape(e['test_type'])} "
+ f"{_html_escape(e['matrix_label'])} "
+ f"{_html_escape(e['suite'])} "
+ f"{_html_escape(e['name'])} "
+ f"{_html_escape(e.get('first_seen', '?'))} "
+ f"{e.get('failure_count', '?')} "
+ )
+        parts.append("</table>")
+
+ # --- Flaky ---
+ if agg["all_flaky_tests"]:
+        parts.append("<h2>Flaky Tests</h2>")
+ parts.append(
+ "Test Type Matrix Suite "
+ "Test Retries "
+ )
+ for e in agg["all_flaky_tests"]:
+ parts.append(
+ f"{_html_escape(e['test_type'])} "
+ f"{_html_escape(e['matrix_label'])} "
+ f"{_html_escape(e['suite'])} "
+ f"{_html_escape(e['name'])} "
+ f"{e.get('retry_count', '?')} "
+ )
+        parts.append("</table>")
+
+ if (
+ not agg["all_new_failures"]
+ and not agg["all_recurring_failures"]
+ and not agg["all_flaky_tests"]
+ and not agg["all_resolved_tests"]
+ ):
+        parts.append(
+            '<div class="all-pass">'
+            "All tests passed across all matrices!</div>"
+        )
+
+ # --- Matrix grid (at the end) ---
+    parts.append("<h2>Matrix Overview</h2>")
+ parts.append(
+ "Test Type Matrix Status "
+ "Passed Failed Flaky Total Report "
+ )
+ for g in agg["matrix_grid"]:
+ counts = g["counts"]
+ report_link = ""
+ if s3_reports_prefix:
+ report_filename = f"{g['test_type']}-{g['matrix_label']}.html"
+            report_link = (
+                f'<a href="{s3_reports_prefix}{report_filename}">View</a>'
+            )
+ parts.append(
+ f"{_html_escape(g['test_type'])} "
+ f"{_html_escape(g['matrix_label'])} "
+ f"{_status_badge(g['status'])} "
+ f"{counts.get('passed', 0)} "
+ f"{counts.get('failed', 0)} "
+ f"{counts.get('flaky', 0)} "
+ f"{counts.get('total', 0)} "
+ f"{report_link} "
+ )
+    parts.append("</table>")
+
+    parts.append("</body></html>")
+ return "\n".join(parts)
+
+
+# ---------------------------------------------------------------------------
+# Index management
+# ---------------------------------------------------------------------------
+
+MAX_INDEX_DAYS = 90 # Keep at most 90 days in the index
+
+
+def update_index(s3_index_uri, date_str, consolidated, output_dir):
+ """Download index.json, add today's entry, prune old entries, re-upload."""
+ local_index = str(output_dir / "index.json")
+
+ # Download existing index (or start fresh)
+ index = {"_schema_version": 1, "dates": {}}
+ if s3_download(s3_index_uri, local_index):
+ try:
+ with open(local_index) as f:
+ loaded = json.load(f)
+ if "dates" in loaded:
+ index = loaded
+ except (json.JSONDecodeError, OSError):
+ pass
+
+ # Add today's entry keyed by date/branch for multi-branch support
+ branch = consolidated.get("branch", "main")
+ entry_key = f"{date_str}/{branch}"
+ index["dates"][entry_key] = {
+ "date": date_str,
+ "branch": branch,
+ "job_summary": consolidated.get("job_summary", {}),
+ "test_totals": consolidated.get("test_totals", {}),
+ "has_new_failures": consolidated.get("has_new_failures", False),
+ "github_run_url": consolidated.get("github_run_url", ""),
+ }
+
+ # Prune to last N entries
+ dates_sorted = sorted(index["dates"].keys(), reverse=True)
+ if len(dates_sorted) > MAX_INDEX_DAYS:
+ for old_key in dates_sorted[MAX_INDEX_DAYS:]:
+ del index["dates"][old_key]
+
+ # Write and upload
+ with open(local_index, "w") as f:
+ json.dump(index, f, indent=2, sort_keys=True)
+ f.write("\n")
+ print(f"Updated index.json with {len(index['dates'])} date(s)")
+
+ s3_upload(local_index, s3_index_uri)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Aggregate per-matrix nightly test summaries"
+ )
+ parser.add_argument(
+ "--s3-summaries-prefix",
+ default="",
+ help="S3 prefix for per-matrix JSON summaries (e.g., s3://bucket/.../summaries/2026-04-13/)",
+ )
+ parser.add_argument(
+ "--s3-summaries-fallback",
+ default="",
+ help="Fallback S3 prefix if no summaries found at primary prefix",
+ )
+ parser.add_argument(
+ "--s3-reports-prefix",
+ default="",
+ help="S3 prefix where per-matrix HTML reports live (for linking)",
+ )
+ parser.add_argument(
+ "--s3-output-uri",
+ default="",
+ help="S3 URI to upload the consolidated JSON",
+ )
+ parser.add_argument(
+ "--s3-html-output-uri",
+ default="",
+ help="S3 URI to upload the consolidated HTML report",
+ )
+ parser.add_argument(
+ "--s3-index-uri",
+ default="",
+ help="S3 URI for the index.json that tracks all available dates (read + write)",
+ )
+ parser.add_argument(
+ "--s3-dashboard-uri",
+ default="",
+ help="S3 URI to upload the dashboard HTML (e.g., s3://bucket/.../dashboard/index.html)",
+ )
+ parser.add_argument(
+ "--dashboard-dir",
+ default="",
+ help="Local directory containing dashboard files to upload",
+ )
+ parser.add_argument(
+ "--local-summaries-dir",
+ default="",
+ help="Local directory with JSON summaries (alternative to S3, for testing)",
+ )
+ parser.add_argument(
+ "--output-dir",
+ default="aggregate-output",
+ help="Local directory to write output files",
+ )
+ parser.add_argument(
+ "--date",
+ default=datetime.now(timezone.utc).strftime("%Y-%m-%d"),
+ help="Date for this run (YYYY-MM-DD)",
+ )
+ parser.add_argument("--branch", default="main", help="Branch name")
+ parser.add_argument(
+ "--github-run-url",
+ default="",
+ help="URL to the GitHub Actions run",
+ )
+ parser.add_argument(
+ "--workflow-jobs",
+ default="",
+ help="Path to JSON file with GitHub Actions workflow job statuses",
+ )
+
+ args = parser.parse_args()
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # ---- Step 1: Collect summaries ----
+ if args.local_summaries_dir:
+ summaries = load_local_summaries(args.local_summaries_dir)
+ elif args.s3_summaries_prefix:
+ download_dir = output_dir / "downloaded_summaries"
+ summaries = download_summaries(
+ args.s3_summaries_prefix, download_dir, args.s3_summaries_fallback
+ )
+ else:
+ print(
+ "ERROR: Provide --s3-summaries-prefix or --local-summaries-dir",
+ file=sys.stderr,
+ )
+ return 1
+
+ if not summaries:
+ print(
+ "WARNING: No summaries found. Generating empty report.",
+ file=sys.stderr,
+ )
+
+ print(f"Loaded {len(summaries)} matrix summary file(s)")
+
+ # ---- Step 2: Aggregate ----
+ agg = aggregate_summaries(summaries)
+ print(
+ f"Matrix grid: {len(agg['matrix_grid'])} jobs — "
+ f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'passed')} passed, "
+ f"{sum(1 for g in agg['matrix_grid'] if g['status'].startswith('failed'))} failed, "
+ f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'flaky')} flaky"
+ )
+
+ # ---- Step 2b: Parse workflow job statuses ----
+ workflow_jobs = parse_workflow_jobs(args.workflow_jobs)
+ if workflow_jobs:
+ failed_wf = [j for j in workflow_jobs if j["conclusion"] == "failure"]
+ print(
+ f"Workflow jobs: {len(workflow_jobs)} total, "
+ f"{len(failed_wf)} failed"
+ )
+
+ # ---- Step 3: Generate outputs ----
+ consolidated = generate_consolidated_json(
+ agg,
+ args.date,
+ args.branch,
+ args.github_run_url,
+ workflow_jobs,
+ )
+
+ json_path = output_dir / "consolidated_summary.json"
+ json_path.write_text(json.dumps(consolidated, indent=2) + "\n")
+ print(f"Consolidated JSON written to {json_path}")
+
+ html_report = generate_consolidated_html(
+ agg,
+ args.date,
+ args.branch,
+ args.github_run_url,
+ args.s3_reports_prefix,
+ )
+ html_path = output_dir / "consolidated_report.html"
+ html_path.write_text(html_report)
+ print(f"Consolidated HTML written to {html_path}")
+
+ # ---- Step 4: Upload to S3 ----
+ if args.s3_output_uri:
+ s3_upload(str(json_path), args.s3_output_uri)
+ if args.s3_html_output_uri:
+ s3_upload(str(html_path), args.s3_html_output_uri)
+
+ # ---- Step 5: Update index.json ----
+ if args.s3_index_uri:
+ update_index(
+ args.s3_index_uri,
+ args.date,
+ consolidated,
+ output_dir,
+ )
+
+ # ---- Step 6: Upload dashboard (self-contained with embedded data) ----
+ if args.s3_dashboard_uri and args.dashboard_dir:
+ dashboard_file = Path(args.dashboard_dir) / "index.html"
+ if dashboard_file.exists():
+ # Read the index.json we just uploaded/created
+ index_path = output_dir / "index.json"
+ index_data = {}
+ if index_path.exists():
+ with open(index_path) as f:
+ index_data = json.load(f)
+
+ # Inject data into dashboard HTML so it works without S3 fetches
+ dashboard_html = dashboard_file.read_text()
+            inject_script = (
+                "<script>window.__NIGHTLY_INDEX__ = "
+                + json.dumps(index_data)
+                + ";</script>\n"
+            )
+            # Insert before </body> so the dashboard works without S3 fetches
+            dashboard_html = dashboard_html.replace(
+                "</body>", inject_script + "</body>"
+            )
+
+ embedded_path = output_dir / "dashboard.html"
+ embedded_path.write_text(dashboard_html)
+ s3_upload(str(embedded_path), args.s3_dashboard_uri)
+            print("Dashboard uploaded with embedded data")
+ else:
+ print(
+ f"WARNING: Dashboard not found at {dashboard_file}",
+ file=sys.stderr,
+ )
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py
new file mode 100755
index 0000000000..2bd23b1f18
--- /dev/null
+++ b/ci/utils/nightly_report.py
@@ -0,0 +1,1005 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Nightly test report generator for cuOpt CI.
+
+Parses JUnit XML test results, classifies failures as flaky vs genuine,
+maintains a failure history database on S3, and outputs:
+ - HTML report (detailed, uploaded to S3 and linked from Slack)
+ - Markdown summary (for $GITHUB_STEP_SUMMARY or terminal)
+ - JSON summary (for downstream consumers like Slack notifier and dashboard)
+
+Each CI matrix job (CUDA version x Python version x architecture) runs this
+script independently. The --test-type and --matrix-label flags identify the
+job so that history and summaries are stored per-matrix-combo.
+
+History lifecycle:
+ 1. Download history from S3 (falls back to empty if not found)
+ 2. Classify this run's results
+ 3. Update history: mark new failures, bump recurring counts, resolve stabilized tests
+ 4. Upload updated history back to S3
+ 5. Generate reports (HTML, Markdown, JSON, GitHub Step Summary)
+ 6. Upload per-run JSON snapshot to S3 summaries dir (for aggregation)
+
+Usage:
+ python ci/utils/nightly_report.py \\
+ --results-dir test-results/ \\
+ --output-dir report-output/ \\
+ --sha abc123 \\
+ --test-type python \\
+ --matrix-label cuda12.9-py3.12-x86_64 \\
+ --s3-history-uri s3://bucket/ci_test_reports/nightly/history/python-main-cuda12.9-py3.12-x86_64.json \\
+ --s3-summary-uri s3://bucket/ci_test_reports/nightly/summaries/2026-04-13/python-cuda12.9-py3.12-x86_64.json
+"""
+
+import argparse
+import json
+import os
+import sys
+from collections import defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+from xml.etree import ElementTree
+
+# Ensure ci/utils is importable when invoked as a script
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from s3_helpers import s3_download, s3_upload # noqa: E402
+
+EMPTY_HISTORY = {"_schema_version": 2, "tests": {}}
+
+# A test that resolves then fails again within this window is considered
+# "bouncing" (intermittently flaky) rather than a new failure.
+BOUNCE_WINDOW_DAYS = int(os.environ.get("CUOPT_BOUNCE_WINDOW_DAYS", 14))
+
+# Number of failure/resolve cycles that classify a test as cross-run flaky.
+BOUNCE_THRESHOLD = int(os.environ.get("CUOPT_BOUNCE_THRESHOLD", 2))
+
+
+# ---------------------------------------------------------------------------
+# JUnit XML parsing
+# ---------------------------------------------------------------------------
+
+
+def parse_junit_xml(xml_path):
+ """Parse a JUnit XML file and return a list of test result dicts."""
+ results = []
+ try:
+ tree = ElementTree.parse(xml_path)
+ except ElementTree.ParseError as e:
+ print(f"WARNING: Failed to parse {xml_path}: {e}", file=sys.stderr)
+ return results
+
+ root = tree.getroot()
+
+ if root.tag == "testsuites":
+ suites = root.findall("testsuite")
+ elif root.tag == "testsuite":
+ suites = [root]
+ else:
+ return results
+
+ for suite in suites:
+ suite_name = suite.get("name", os.path.basename(xml_path))
+ for testcase in suite.findall("testcase"):
+ name = testcase.get("name", "unknown")
+ classname = testcase.get("classname", "")
+ time_taken = testcase.get("time", "0")
+
+ failure = testcase.find("failure")
+ error = testcase.find("error")
+ skipped = testcase.find("skipped")
+
+ if skipped is not None:
+ status = "skipped"
+ message = skipped.get("message", "")
+ elif failure is not None:
+ status = "failed"
+ message = failure.get("message", "")
+ if failure.text:
+ message = failure.text[:500]
+ elif error is not None:
+ status = "error"
+ message = error.get("message", "")
+ if error.text:
+ message = error.text[:500]
+ else:
+ status = "passed"
+ message = ""
+
+ results.append(
+ {
+ "suite": suite_name,
+ "classname": classname,
+ "name": name,
+ "status": status,
+ "time": time_taken,
+ "message": message,
+ "source_file": str(xml_path),
+ }
+ )
+
+ return results
+
+
+def collect_all_results(results_dir):
+ """Collect test results from all JUnit XML files in a directory."""
+ results_dir = Path(results_dir)
+ all_results = []
+ for xml_file in sorted(results_dir.rglob("*.xml")):
+ all_results.extend(parse_junit_xml(xml_file))
+ return all_results
+
+
+# ---------------------------------------------------------------------------
+# Classification
+# ---------------------------------------------------------------------------
+
+
+def classify_failures(results):
+ """
+ Classify test results into passed, failed, flaky, skipped, and error.
+
+ pytest-rerunfailures records reruns as additional entries.
+ A test that failed then passed on rerun is flaky.
+ """
+ test_groups = defaultdict(list)
+ for r in results:
+ key = f"{r['suite']}::{r['classname']}::{r['name']}"
+ test_groups[key].append(r)
+
+ classified = {
+ "passed": [],
+ "failed": [],
+ "flaky": [],
+ "skipped": [],
+ "error": [],
+ }
+
+ for key, entries in test_groups.items():
+ statuses = [e["status"] for e in entries]
+
+ if all(s == "skipped" for s in statuses):
+ classified["skipped"].append(entries[0])
+ elif any(s == "passed" for s in statuses):
+ if any(s in ("failed", "error") for s in statuses):
+ entry = entries[-1].copy()
+ entry["status"] = "flaky"
+ entry["retry_count"] = sum(
+ 1 for s in statuses if s in ("failed", "error")
+ )
+ classified["flaky"].append(entry)
+ else:
+ classified["passed"].append(entries[-1])
+ elif any(s == "error" for s in statuses):
+ classified["error"].append(entries[-1])
+ else:
+ classified["failed"].append(entries[-1])
+
+ return classified
+
+
+# ---------------------------------------------------------------------------
+# History management
+# ---------------------------------------------------------------------------
+
+
+def load_history(history_path):
+ """Load failure history from a local JSON file."""
+ try:
+ with open(history_path) as f:
+ data = json.load(f)
+ if "tests" in data:
+ return data
+ except (FileNotFoundError, json.JSONDecodeError):
+ pass
+ return dict(EMPTY_HISTORY)
+
+
+def _days_between(date_a, date_b):
+ """Return absolute number of days between two YYYY-MM-DD strings."""
+ try:
+ a = datetime.strptime(date_a, "%Y-%m-%d")
+ b = datetime.strptime(date_b, "%Y-%m-%d")
+ return abs((a - b).days)
+ except (ValueError, TypeError):
+ return 999
+
+
+def _is_recent_resolve(rec, date_str):
+ """Check if a test was resolved recently (within bounce window)."""
+ resolved_date = rec.get("resolved_date", "")
+ if not resolved_date:
+ return False
+ return _days_between(resolved_date, date_str) <= BOUNCE_WINDOW_DAYS
+
+
+def update_history(history, classified, sha, date_str):
+ """
+ Update failure history with this run's results.
+
+ Returns (history, new_failures, recurring_failures, resolved_tests).
+
+ Classification logic:
+ - "new failure": never seen before (no history entry at all)
+ - "recurring": was already active (failing on previous runs)
+ - "bouncing": was resolved recently but failed again — reactivated
+ as recurring (not new), and marked cross-run flaky after 2+ bounces
+ - "resolved": was active, now passes — notified once, then silent
+ on subsequent passes
+ """
+ tests = history.setdefault("tests", {})
+ new_failures = []
+ recurring_failures = []
+ resolved_tests = []
+
+ # --- Genuine failures ---
+ for entry in classified["failed"] + classified["error"]:
+ test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}"
+
+ if test_key in tests:
+ rec = tests[test_key]
+
+ if rec["status"] == "active":
+ # Still failing — bump count
+ rec["last_seen_date"] = date_str
+ rec["last_seen_sha"] = sha
+ rec["failure_count"] += 1
+ recurring_failures.append(
+ {**entry, "first_seen": rec["first_seen_date"]}
+ )
+ elif rec["status"] == "resolved" and _is_recent_resolve(
+ rec, date_str
+ ):
+ # Bouncing: resolved recently but failed again.
+ # Reactivate as recurring, not new. Track the bounce.
+ rec["status"] = "active"
+ rec["last_seen_date"] = date_str
+ rec["last_seen_sha"] = sha
+ rec["failure_count"] += 1
+ rec["bounce_count"] = rec.get("bounce_count", 0) + 1
+ if rec["bounce_count"] >= BOUNCE_THRESHOLD:
+ rec["is_flaky"] = True
+ recurring_failures.append(
+ {
+ **entry,
+ "first_seen": rec["first_seen_date"],
+ "is_bouncing": True,
+ }
+ )
+ else:
+ # Resolved long ago — treat as new cycle but keep history
+ rec["status"] = "active"
+ rec["last_seen_date"] = date_str
+ rec["last_seen_sha"] = sha
+ rec["failure_count"] += 1
+ rec["bounce_count"] = rec.get("bounce_count", 0) + 1
+ new_failures.append(entry)
+ else:
+ # Truly new — never seen before
+ tests[test_key] = {
+ "suite": entry["suite"],
+ "classname": entry["classname"],
+ "name": entry["name"],
+ "first_seen_date": date_str,
+ "first_seen_sha": sha,
+ "last_seen_date": date_str,
+ "last_seen_sha": sha,
+ "failure_count": 1,
+ "is_flaky": False,
+ "bounce_count": 0,
+ "status": "active",
+ }
+ new_failures.append(entry)
+
+ # --- Flaky tests (passed on retry within this run) ---
+ for entry in classified["flaky"]:
+ test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}"
+ if test_key in tests:
+ rec = tests[test_key]
+ rec["last_seen_date"] = date_str
+ rec["last_seen_sha"] = sha
+ rec["failure_count"] += 1
+ rec["is_flaky"] = True
+ # If it was resolved, reactivate — it's still unstable
+ if rec["status"] == "resolved":
+ rec["status"] = "active"
+ rec["bounce_count"] = rec.get("bounce_count", 0) + 1
+ else:
+ tests[test_key] = {
+ "suite": entry["suite"],
+ "classname": entry["classname"],
+ "name": entry["name"],
+ "first_seen_date": date_str,
+ "first_seen_sha": sha,
+ "last_seen_date": date_str,
+ "last_seen_sha": sha,
+ "failure_count": 1,
+ "is_flaky": True,
+ "bounce_count": 0,
+ "status": "active",
+ }
+
+ # --- Resolve stabilized tests ---
+ passed_keys = set()
+ for entry in classified["passed"]:
+ test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}"
+ passed_keys.add(test_key)
+
+ for test_key in passed_keys:
+ if test_key in tests and tests[test_key]["status"] == "active":
+ rec = tests[test_key]
+ rec["status"] = "resolved"
+ rec["resolved_date"] = date_str
+ rec["resolved_sha"] = sha
+ resolved_tests.append(
+ {
+ "suite": rec["suite"],
+ "classname": rec["classname"],
+ "name": rec["name"],
+ "first_seen": rec["first_seen_date"],
+ "failure_count": rec["failure_count"],
+ "bounce_count": rec.get("bounce_count", 0),
+ "was_flaky": rec.get("is_flaky", False),
+ }
+ )
+ # If already "resolved" and passes again — no notification.
+ # The resolved notification was sent once when it first stabilized.
+
+ return history, new_failures, recurring_failures, resolved_tests
+
+
+def save_history(history, history_path):
+ """Write history to a local JSON file."""
+ with open(history_path, "w") as f:
+ json.dump(history, f, indent=2, sort_keys=True)
+ f.write("\n")
+
+
+# ---------------------------------------------------------------------------
+# Report generation
+# ---------------------------------------------------------------------------
+
+
+def generate_markdown_report(
+ classified,
+ new_failures,
+ recurring_failures,
+ resolved_tests,
+ history,
+ test_type="",
+ matrix_label="",
+ sha="",
+ date_str="",
+):
+ """Generate a Markdown summary report."""
+ lines = []
+ title = "# Nightly Test Report"
+ if test_type:
+ title += f" — {test_type}"
+ if matrix_label:
+ title += f" [{matrix_label}]"
+ lines.append(title)
+ lines.append("")
+ if date_str or sha:
+ meta_parts = []
+ if date_str:
+ meta_parts.append(f"**Date:** {date_str}")
+ if sha:
+ meta_parts.append(f"**Commit:** `{sha[:12]}`")
+ if matrix_label:
+ meta_parts.append(f"**Matrix:** {matrix_label}")
+ lines.append(" | ".join(meta_parts))
+ lines.append("")
+
+ total_passed = len(classified["passed"])
+ total_failed = len(classified["failed"]) + len(classified["error"])
+ total_flaky = len(classified["flaky"])
+ total_skipped = len(classified["skipped"])
+ total = total_passed + total_failed + total_flaky + total_skipped
+
+ lines.append("## Summary")
+ lines.append("")
+ lines.append("| Metric | Count |")
+ lines.append("|--------|-------|")
+ lines.append(f"| Total tests | {total} |")
+ lines.append(f"| Passed | {total_passed} |")
+ lines.append(f"| **Genuine failures** | **{total_failed}** |")
+ lines.append(f"| Flaky (passed on retry) | {total_flaky} |")
+ lines.append(f"| Skipped | {total_skipped} |")
+ if resolved_tests:
+ lines.append(
+ f"| **Stabilized (were failing, now pass)** | **{len(resolved_tests)}** |"
+ )
+ lines.append("")
+
+ # -- New genuine failures (highest priority) --
+ if new_failures:
+ lines.append("## NEW Failures (not previously seen)")
+ lines.append("")
+ lines.append("| Suite | Test | Error |")
+ lines.append("|-------|------|-------|")
+ for entry in new_failures:
+ short_msg = (
+ entry.get("message", "")[:80]
+ .replace("\n", " ")
+ .replace("|", "\\|")
+ )
+ lines.append(
+ f"| {entry['suite']} | `{entry['name']}` | {short_msg} |"
+ )
+ lines.append("")
+
+ # -- Recurring failures --
+ if recurring_failures:
+ lines.append("## Recurring Failures")
+ lines.append("")
+ lines.append("| Suite | Test | First seen | Failure count | Error |")
+ lines.append("|-------|------|------------|---------------|-------|")
+ for entry in recurring_failures:
+ short_msg = (
+ entry.get("message", "")[:60]
+ .replace("\n", " ")
+ .replace("|", "\\|")
+ )
+ first_seen = entry.get("first_seen", "unknown")
+ test_key = (
+ f"{entry['suite']}::{entry['classname']}::{entry['name']}"
+ )
+ count = (
+ history.get("tests", {})
+ .get(test_key, {})
+ .get("failure_count", "?")
+ )
+ lines.append(
+ f"| {entry['suite']} | `{entry['name']}` | {first_seen} | {count} | {short_msg} |"
+ )
+ lines.append("")
+
+ # -- Stabilized tests --
+ if resolved_tests:
+ lines.append("## Stabilized Tests (were failing, now passing)")
+ lines.append("")
+ lines.append(
+ "| Suite | Test | Was failing since | Total failure count | Was flaky? |"
+ )
+ lines.append(
+ "|-------|------|-------------------|---------------------|------------|"
+ )
+ for entry in resolved_tests:
+ flaky_badge = "Yes" if entry.get("was_flaky") else "No"
+ lines.append(
+ f"| {entry['suite']} | `{entry['name']}` | {entry['first_seen']} "
+ f"| {entry['failure_count']} | {flaky_badge} |"
+ )
+ lines.append("")
+
+ # -- Flaky tests --
+ if classified["flaky"]:
+ lines.append("## Flaky Tests (passed on retry)")
+ lines.append("")
+ lines.append("| Suite | Test | Retries needed |")
+ lines.append("|-------|------|----------------|")
+ for entry in classified["flaky"]:
+ retry_count = entry.get("retry_count", "?")
+ lines.append(
+ f"| {entry['suite']} | `{entry['name']}` | {retry_count} |"
+ )
+ lines.append("")
+
+ # -- Detailed errors --
+ all_failures = classified["failed"] + classified["error"]
+ if all_failures:
+ lines.append("## All Failure Details")
+ lines.append("")
+ for entry in all_failures:
+ lines.append(f"### `{entry['classname']}::{entry['name']}`")
+ lines.append(f"- **Suite**: {entry['suite']}")
+ lines.append(f"- **Source**: {entry['source_file']}")
+ msg = entry.get("message", "").strip()
+ if msg:
+ lines.append("- **Error**:")
+ lines.append("```")
+ for line in msg.split("\n")[:20]:
+ lines.append(line)
+ lines.append("```")
+ lines.append("")
+
+ if not all_failures and not classified["flaky"] and not resolved_tests:
+ lines.append("All tests passed! No failures or flaky tests detected.")
+ lines.append("")
+
+ return "\n".join(lines)
+
+
+def generate_json_summary(
+ classified,
+ new_failures,
+ recurring_failures,
+ resolved_tests,
+ test_type="",
+ matrix_label="",
+ sha="",
+ date_str="",
+):
+ """Generate a JSON summary for downstream tools (Slack notifier, dashboard)."""
+ return {
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "test_type": test_type,
+ "matrix_label": matrix_label,
+ "sha": sha,
+ "date": date_str,
+ "counts": {
+ "total": sum(len(v) for v in classified.values()),
+ "passed": len(classified["passed"]),
+ "failed": len(classified["failed"]) + len(classified["error"]),
+ "flaky": len(classified["flaky"]),
+ "skipped": len(classified["skipped"]),
+ "resolved": len(resolved_tests),
+ },
+ "has_new_failures": len(new_failures) > 0,
+ "new_failures": [
+ {
+ "suite": e["suite"],
+ "name": e["name"],
+ "classname": e["classname"],
+ "message": e.get("message", "")[:200],
+ }
+ for e in new_failures
+ ],
+ "recurring_failures": [
+ {
+ "suite": e["suite"],
+ "name": e["name"],
+ "classname": e["classname"],
+ "first_seen": e.get("first_seen", "unknown"),
+ "message": e.get("message", "")[:200],
+ }
+ for e in recurring_failures
+ ],
+ "flaky_tests": [
+ {
+ "suite": e["suite"],
+ "name": e["name"],
+ "classname": e["classname"],
+ "retry_count": e.get("retry_count", 0),
+ }
+ for e in classified["flaky"]
+ ],
+ "resolved_tests": [
+ {
+ "suite": e["suite"],
+ "name": e["name"],
+ "classname": e["classname"],
+ "first_seen": e.get("first_seen", "unknown"),
+ "failure_count": e.get("failure_count", 0),
+ "was_flaky": e.get("was_flaky", False),
+ }
+ for e in resolved_tests
+ ],
+ }
+
+
+# ---------------------------------------------------------------------------
+# HTML report
+# ---------------------------------------------------------------------------
+
+
+def _html_escape(text):
+    """Escape HTML special characters."""
+    return (
+        text.replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+        .replace('"', "&quot;")
+    )
+
+
+def generate_html_report(
+ classified,
+ new_failures,
+ recurring_failures,
+ resolved_tests,
+ history,
+ test_type="",
+ matrix_label="",
+ sha="",
+ date_str="",
+):
+ """Generate a self-contained HTML report with detailed failure info."""
+ total_passed = len(classified["passed"])
+ total_failed = len(classified["failed"]) + len(classified["error"])
+ total_flaky = len(classified["flaky"])
+ total_skipped = len(classified["skipped"])
+ total = total_passed + total_failed + total_flaky + total_skipped
+
+ title = "Nightly Test Report"
+ if test_type:
+ title += f" — {_html_escape(test_type)}"
+ if matrix_label:
+ title += f" [{_html_escape(matrix_label)}]"
+
+ # Determine overall status color
+ if total_failed > 0:
+ status_color = "#d32f2f"
+ status_text = f"{total_failed} failure(s)"
+ elif total_flaky > 0:
+ status_color = "#f9a825"
+ status_text = "All passed (flaky detected)"
+ else:
+ status_color = "#388e3c"
+ status_text = "All passed"
+
+ parts = []
+ parts.append(f"""
+
+
+
+
+{title}
+
+
+
+{title}
+""")
+
+ meta_parts = []
+ if date_str:
+ meta_parts.append(f"Date: {_html_escape(date_str)} ")
+ if sha:
+ meta_parts.append(f"Commit: {_html_escape(sha[:12])}")
+ if matrix_label:
+ meta_parts.append(
+ f"Matrix: {_html_escape(matrix_label)} "
+ )
+ parts.append(" | ".join(meta_parts))
+
+ parts.append(f"""
+{status_text}
+
+
+
+
+
+
+
{len(resolved_tests)}
Stabilized
+
""")
+
+ # --- New failures ---
+ if new_failures:
+ parts.append("New Failures ")
+ parts.append("Suite Test Error ")
+ for e in new_failures:
+ msg = _html_escape(e.get("message", ""))
+ short = _html_escape(e.get("message", "")[:100])
+ parts.append(
+ f"{_html_escape(e['suite'])} "
+ f"{_html_escape(e['name'])} "
+ f'NEW '
+ f"{short} "
+ f'{msg} '
+ )
+        parts.append("</table>")
+
+ # --- Recurring failures ---
+ if recurring_failures:
+ parts.append("Recurring Failures ")
+ parts.append(
+ "Suite Test First Seen "
+ "Count Error "
+ )
+ for e in recurring_failures:
+ msg = _html_escape(e.get("message", ""))
+ short = _html_escape(e.get("message", "")[:100])
+ first_seen = _html_escape(e.get("first_seen", "unknown"))
+ test_key = f"{e['suite']}::{e['classname']}::{e['name']}"
+ count = (
+ history.get("tests", {})
+ .get(test_key, {})
+ .get("failure_count", "?")
+ )
+ parts.append(
+ f"{_html_escape(e['suite'])} "
+ f"{_html_escape(e['name'])} "
+ f'RECURRING '
+ f"{first_seen} {count} "
+ f"{short} "
+ f'{msg} '
+ )
+        parts.append("</table>")
+
+ # --- Stabilized ---
+ if resolved_tests:
+ parts.append("Stabilized Tests ")
+ parts.append(
+ "Suite Test Failing Since "
+ "Failure Count Was Flaky? "
+ )
+ for e in resolved_tests:
+ flaky_tag = "Yes" if e.get("was_flaky") else "No"
+ parts.append(
+ f"{_html_escape(e['suite'])} "
+ f"{_html_escape(e['name'])} "
+ f'FIXED '
+ f"{_html_escape(e.get('first_seen', '?'))} "
+ f"{e.get('failure_count', '?')} "
+ f"{flaky_tag} "
+ )
+        parts.append("</table>")
+
+ # --- Flaky ---
+ if classified["flaky"]:
+ parts.append("Flaky Tests (passed on retry) ")
+ parts.append("Suite Test Retries ")
+ for e in classified["flaky"]:
+ parts.append(
+ f"{_html_escape(e['suite'])} "
+ f"{_html_escape(e['name'])} "
+ f'FLAKY '
+ f"{e.get('retry_count', '?')} "
+ )
+        parts.append("</table>")
+
+ # --- All failure details ---
+ all_failures = classified["failed"] + classified["error"]
+ if all_failures:
+ parts.append("All Failure Details ")
+ for e in all_failures:
+ msg = _html_escape(e.get("message", "").strip())
+ parts.append(
+ f''
+ f"{_html_escape(e['classname'])}::{_html_escape(e['name'])} "
+ f''
+ f"Suite: {_html_escape(e['suite'])} | "
+ f"Source: {_html_escape(e['source_file'])}
"
+ )
+ if msg:
+ parts.append(f'{msg} ')
+ parts.append(" ")
+
+ if not all_failures and not classified["flaky"] and not resolved_tests:
+ parts.append(
+ 'All tests passed! No failures or flaky tests detected.
'
+ )
+
+    parts.append("</body></html>")
+ return "\n".join(parts)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Generate nightly test failure report from JUnit XML results"
+ )
+ parser.add_argument(
+ "--results-dir",
+ required=True,
+ help="Directory containing JUnit XML test result files",
+ )
+ parser.add_argument(
+ "--output-dir",
+ default="report-output",
+ help="Directory to write report files to",
+ )
+ parser.add_argument(
+ "--sha",
+ default=os.environ.get("GITHUB_SHA", "unknown"),
+ help="Git commit SHA for this run",
+ )
+ parser.add_argument(
+ "--date",
+ default=datetime.now(timezone.utc).strftime("%Y-%m-%d"),
+ help="Date for this run (YYYY-MM-DD)",
+ )
+ parser.add_argument(
+ "--test-type",
+ default="unknown",
+ help=(
+ "Test type identifier (e.g., cpp, python, wheel-python, "
+ "wheel-server, notebooks)"
+ ),
+ )
+ parser.add_argument(
+ "--matrix-label",
+ default="",
+ help=(
+ "Matrix combination label (e.g., cuda12.9-py3.12-x86_64). "
+ "Included in reports and JSON summary to identify the CI job."
+ ),
+ )
+ parser.add_argument(
+ "--s3-history-uri",
+ default="",
+ help=(
+ "S3 URI for persistent failure history JSON. "
+ "Downloaded before analysis, uploaded after update. "
+ "Example: s3://bucket/ci_test_reports/nightly/history/"
+ "python-main-cuda12.9-py3.12-x86_64.json"
+ ),
+ )
+ parser.add_argument(
+ "--s3-summary-uri",
+ default="",
+ help=(
+ "S3 URI to upload this run's JSON snapshot for aggregation. "
+ "Example: s3://bucket/ci_test_reports/nightly/summaries/"
+ "2026-04-13/python-cuda12.9-py3.12-x86_64.json"
+ ),
+ )
+ parser.add_argument(
+ "--s3-html-uri",
+ default="",
+ help=(
+ "S3 URI to upload the HTML report. "
+ "Example: s3://bucket/ci_test_reports/nightly/reports/"
+ "2026-04-13/python-cuda12.9-py3.12-x86_64.html"
+ ),
+ )
+ parser.add_argument(
+ "--github-step-summary",
+ default=os.environ.get("GITHUB_STEP_SUMMARY", ""),
+ help="Path to write GitHub Actions step summary",
+ )
+
+ args = parser.parse_args()
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ local_history_path = str(output_dir / "test_failure_history.json")
+
+ # ---- Step 1: Download history from S3 ----
+ if args.s3_history_uri:
+ s3_download(args.s3_history_uri, local_history_path)
+
+ # ---- Step 2: Collect and classify results ----
+ print(f"Collecting test results from {args.results_dir} ...")
+ results = collect_all_results(args.results_dir)
+ if not results:
+ print("WARNING: No test results found.", file=sys.stderr)
+
+ print(f"Found {len(results)} test case entries across all XML files")
+ classified = classify_failures(results)
+
+ print(
+ f"Classification: {len(classified['passed'])} passed, "
+ f"{len(classified['failed'])} failed, "
+ f"{len(classified['error'])} errors, "
+ f"{len(classified['flaky'])} flaky, "
+ f"{len(classified['skipped'])} skipped"
+ )
+
+ # ---- Step 3: Update history ----
+ history = load_history(local_history_path)
+ history, new_failures, recurring_failures, resolved_tests = update_history(
+ history, classified, args.sha, args.date
+ )
+
+ if resolved_tests:
+ print(
+ f"Stabilized: {len(resolved_tests)} previously-failing test(s) now pass"
+ )
+
+ save_history(history, local_history_path)
+ print(f"Updated local history at {local_history_path}")
+
+ # ---- Step 4: Upload history back to S3 ----
+ if args.s3_history_uri:
+ s3_upload(local_history_path, args.s3_history_uri)
+
+ # ---- Step 5: Generate reports ----
+ report_kwargs = dict(
+ test_type=args.test_type,
+ matrix_label=args.matrix_label,
+ sha=args.sha,
+ date_str=args.date,
+ )
+
+ md_report = generate_markdown_report(
+ classified,
+ new_failures,
+ recurring_failures,
+ resolved_tests,
+ history,
+ **report_kwargs,
+ )
+ md_path = output_dir / "nightly_report.md"
+ md_path.write_text(md_report)
+ print(f"Markdown report written to {md_path}")
+
+ html_report = generate_html_report(
+ classified,
+ new_failures,
+ recurring_failures,
+ resolved_tests,
+ history,
+ **report_kwargs,
+ )
+ html_path = output_dir / "nightly_report.html"
+ html_path.write_text(html_report)
+ print(f"HTML report written to {html_path}")
+
+ json_summary = generate_json_summary(
+ classified,
+ new_failures,
+ recurring_failures,
+ resolved_tests,
+ **report_kwargs,
+ )
+ json_path = output_dir / "nightly_summary.json"
+ json_path.write_text(json.dumps(json_summary, indent=2) + "\n")
+ print(f"JSON summary written to {json_path}")
+
+ if args.github_step_summary:
+ with open(args.github_step_summary, "a") as f:
+ f.write(md_report)
+ print(f"Wrote GitHub Step Summary to {args.github_step_summary}")
+
+ # ---- Step 6: Upload per-run snapshot and HTML to S3 ----
+ if args.s3_summary_uri:
+ s3_upload(str(json_path), args.s3_summary_uri)
+
+ if args.s3_html_uri:
+ s3_upload(str(html_path), args.s3_html_uri)
+
+ # ---- Exit code ----
+ genuine_failures = len(classified["failed"]) + len(classified["error"])
+ if genuine_failures > 0:
+ print(
+ f"\nFAILED: {genuine_failures} genuine test failure(s) detected."
+ )
+ return 1
+ if classified["flaky"]:
+ print(
+ f"\nWARNING: All tests passed but {len(classified['flaky'])} flaky test(s) detected."
+ )
+ else:
+ print("\nAll tests passed.")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh
new file mode 100755
index 0000000000..c3b77e6b7a
--- /dev/null
+++ b/ci/utils/nightly_report_helper.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Shared helper for generating nightly test reports with matrix-aware S3 paths.
+#
+# Usage (source from any test script):
+#
+# # For C++ tests (no Python version in matrix label):
+# generate_nightly_report "cpp"
+#
+# # For Python tests (includes Python version in matrix label):
+# generate_nightly_report "python" --with-python-version
+#
+# # For wheel tests:
+# generate_nightly_report "wheel-python" --with-python-version
+#
+# Prerequisites (set before calling):
+# RAPIDS_TESTS_DIR - directory containing JUnit XML test results
+#
+# Optional environment variables (auto-detected if not set):
+# RAPIDS_CUDA_VERSION - CUDA version (e.g., "12.9")
+# RAPIDS_PY_VERSION - Python version (e.g., "3.12"), used with --with-python-version
+# RAPIDS_BRANCH - branch name (e.g., "main")
+# CUOPT_S3_URI - S3 bucket root (e.g., s3://cuopt-datasets/)
+# GITHUB_SHA - commit SHA
+# GITHUB_STEP_SUMMARY - path for GitHub Actions step summary
+
+# Resolve the directory where THIS helper lives (ci/utils/)
+_HELPER_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+
+generate_nightly_report() {
+ local test_type="${1:?Usage: generate_nightly_report [--with-python-version]}"
+ local include_py_version=false
+
+ shift
+ while [ $# -gt 0 ]; do
+ case "$1" in
+ --with-python-version) include_py_version=true ;;
+ *) echo "WARNING: Unknown option: $1" >&2 ;;
+ esac
+ shift
+ done
+
+ # --- Build matrix label ---
+ local cuda_tag="cuda${RAPIDS_CUDA_VERSION:-unknown}"
+ local arch_tag
+ arch_tag="$(arch)"
+ local matrix_label="${cuda_tag}-${arch_tag}"
+
+ if [ "${include_py_version}" = true ]; then
+ local py_tag="py${RAPIDS_PY_VERSION:-unknown}"
+ matrix_label="${cuda_tag}-${py_tag}-${arch_tag}"
+ fi
+
+ local branch_slug
+ branch_slug=$(echo "${RAPIDS_BRANCH:-main}" | tr '/' '-')
+ local run_date
+ run_date="$(date +%F)"
+
+ # --- Ensure results dir exists ---
+ RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}"
+ mkdir -p "${RAPIDS_TESTS_DIR}"
+
+ local report_output_dir="${RAPIDS_TESTS_DIR}/report"
+ mkdir -p "${report_output_dir}"
+
+ # --- Build S3 URIs ---
+ local s3_history_uri=""
+ local s3_summary_uri=""
+ local s3_html_uri=""
+
+ if [ -n "${CUOPT_S3_URI:-}" ]; then
+ local s3_base="${CUOPT_S3_URI}ci_test_reports/nightly"
+ s3_history_uri="${s3_base}/history/${branch_slug}/${test_type}-${matrix_label}.json"
+ s3_summary_uri="${s3_base}/summaries/${run_date}/${branch_slug}/${test_type}-${matrix_label}.json"
+ s3_html_uri="${s3_base}/reports/${run_date}/${branch_slug}/${test_type}-${matrix_label}.html"
+ fi
+
+ # --- Run nightly report ---
+ python3 "${_HELPER_DIR}/nightly_report.py" \
+ --results-dir "${RAPIDS_TESTS_DIR}" \
+ --output-dir "${report_output_dir}" \
+ --sha "${GITHUB_SHA:-unknown}" \
+ --date "${run_date}" \
+ --test-type "${test_type}" \
+ --matrix-label "${matrix_label}" \
+ --s3-history-uri "${s3_history_uri}" \
+ --s3-summary-uri "${s3_summary_uri}" \
+ --s3-html-uri "${s3_html_uri}" \
+ --github-step-summary "${GITHUB_STEP_SUMMARY:-}" \
+ || true
+}
diff --git a/ci/utils/s3_helpers.py b/ci/utils/s3_helpers.py
new file mode 100644
index 0000000000..54e8b96d21
--- /dev/null
+++ b/ci/utils/s3_helpers.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Shared S3 helper functions for cuOpt CI scripts.
+
+Maps CUOPT_AWS_* credentials to standard AWS env vars and provides
+download / upload / list wrappers around the aws CLI.
+"""
+
+import os
+import subprocess
+import sys
+
+
+def s3_env():
+ """Build env dict for AWS CLI calls using CUOPT-specific credentials.
+
+ The cuOpt S3 bucket requires explicit CUOPT_AWS_* static credentials.
+ Role-based credentials from aws-actions/configure-aws-credentials do not
+ have access. We override AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY with
+ the CUOPT_* values and unset AWS_SESSION_TOKEN to avoid mixing with
+ role-based session tokens (matching the pattern in datasets/*.sh).
+ """
+ env = os.environ.copy()
+ if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"):
+ env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"]
+ if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"):
+ env["AWS_SECRET_ACCESS_KEY"] = os.environ[
+ "CUOPT_AWS_SECRET_ACCESS_KEY"
+ ]
+ # Unset session token to avoid mixing role-based tokens with static keys
+ env.pop("AWS_SESSION_TOKEN", None)
+ if os.environ.get("CUOPT_AWS_REGION"):
+ env["AWS_DEFAULT_REGION"] = os.environ["CUOPT_AWS_REGION"]
+ elif "AWS_DEFAULT_REGION" not in env:
+ env["AWS_DEFAULT_REGION"] = "us-east-1"
+ return env
+
+
+def s3_download(s3_uri, local_path):
+ """Download a file from S3. Returns True on success, False on any error."""
+ env = s3_env()
+ try:
+ subprocess.run(
+ ["aws", "s3", "cp", s3_uri, local_path],
+ env=env,
+ check=True,
+ capture_output=True,
+ text=True,
+ )
+ print(f"Downloaded {s3_uri}")
+ return True
+ except FileNotFoundError:
+ print(
+ "WARNING: aws CLI not found, skipping S3 download", file=sys.stderr
+ )
+ return False
+ except subprocess.CalledProcessError as exc:
+ print(
+ f"WARNING: S3 download failed (first run?): {exc.stderr.strip()}",
+ file=sys.stderr,
+ )
+ return False
+
+
+def s3_upload(local_path, s3_uri):
+ """Upload a file to S3. Returns True on success."""
+ env = s3_env()
+ try:
+ subprocess.run(
+ ["aws", "s3", "cp", local_path, s3_uri],
+ env=env,
+ check=True,
+ capture_output=True,
+ text=True,
+ )
+ print(f"Uploaded {local_path} to {s3_uri}")
+ return True
+ except FileNotFoundError:
+ print(
+ "WARNING: aws CLI not found, skipping S3 upload", file=sys.stderr
+ )
+ return False
+ except subprocess.CalledProcessError as exc:
+ print(
+ f"WARNING: S3 upload failed: {exc.stderr.strip()}", file=sys.stderr
+ )
+ return False
+
+
+def s3_list(s3_prefix):
+ """List objects under an S3 prefix (recursive). Returns list of S3 URIs."""
+ env = s3_env()
+ # Extract bucket and prefix from s3_prefix for reconstructing full URIs
+ # s3_prefix looks like "s3://bucket/path/to/prefix/"
+ try:
+ result = subprocess.run(
+ ["aws", "s3", "ls", "--recursive", s3_prefix],
+ env=env,
+ check=True,
+ capture_output=True,
+ text=True,
+ )
+ except (FileNotFoundError, subprocess.CalledProcessError) as exc:
+ print(f"WARNING: S3 ls failed: {exc}", file=sys.stderr)
+ return []
+
+ # --recursive output format: "2026-04-16 12:00:00 1234 path/to/file.json"
+ # We need to reconstruct full S3 URIs from the key paths
+ # Parse bucket from s3_prefix
+ if not s3_prefix.startswith("s3://"):
+ return []
+ without_scheme = s3_prefix[5:] # remove "s3://"
+ bucket = without_scheme.split("/")[0]
+ base_uri = f"s3://{bucket}/"
+
+ uris = []
+ for line in result.stdout.strip().splitlines():
+ parts = line.split(None, 3) # date, time, size, key
+ if len(parts) == 4:
+ uris.append(f"{base_uri}{parts[3]}")
+ return uris
diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh
new file mode 100755
index 0000000000..195a7d5797
--- /dev/null
+++ b/ci/utils/send_consolidated_summary.sh
@@ -0,0 +1,401 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Send a consolidated Slack notification for the entire nightly run.
+# Reads the aggregated JSON produced by aggregate_nightly.py and sends:
+# - Main message: Header + status summary + test totals + failed CI jobs
+# - Thread replies: per-workflow failure/flaky/resolved test details, plus HTML report upload
+#
+# If SLACK_BOT_TOKEN is available, posts via chat.postMessage (enables
+# threading). Falls back to webhook (no threading) otherwise.
+#
+# Required environment variables:
+# SLACK_WEBHOOK_URL - Slack incoming webhook URL (fallback)
+# CONSOLIDATED_SUMMARY - Path to consolidated_summary.json
+#
+# Optional environment variables:
+# CONSOLIDATED_HTML - Path to consolidated HTML file to upload
+# SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) for threading + file uploads
+# SLACK_CHANNEL_ID - Slack channel ID (required with bot token)
+# PRESIGNED_REPORT_URL - Presigned URL for consolidated HTML report
+# PRESIGNED_DASHBOARD_URL - Presigned URL for dashboard
+
+set -euo pipefail
+
+CONSOLIDATED_SUMMARY="${CONSOLIDATED_SUMMARY:?CONSOLIDATED_SUMMARY must point to consolidated_summary.json}"
+SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:?SLACK_WEBHOOK_URL is required}"
+CONSOLIDATED_HTML="${CONSOLIDATED_HTML:-}"
+SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}"
+SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}"
+PRESIGNED_REPORT_URL="${PRESIGNED_REPORT_URL:-}"
+PRESIGNED_DASHBOARD_URL="${PRESIGNED_DASHBOARD_URL:-}"
+
+if [ ! -f "${CONSOLIDATED_SUMMARY}" ]; then
+ echo "ERROR: Summary file not found: ${CONSOLIDATED_SUMMARY}" >&2
+ exit 1
+fi
+
+# Generate Slack payloads — one JSON object per line.
+# Line 1 = main message, lines 2+ = thread replies.
+PAYLOADS=$(python3 - "${CONSOLIDATED_SUMMARY}" "${PRESIGNED_REPORT_URL}" "${PRESIGNED_DASHBOARD_URL}" <<'PYEOF'
+import json, sys
+
+summary_path = sys.argv[1]
+presigned_report_url = sys.argv[2] if len(sys.argv) > 2 else ""
+presigned_dashboard_url = sys.argv[3] if len(sys.argv) > 3 else ""
+
+with open(summary_path) as f:
+ d = json.load(f)
+
+branch = d.get("branch", "main")
+date = d.get("date", "unknown")
+github_run_url = d.get("github_run_url", "")
+jobs = d.get("job_summary", {})
+totals = d.get("test_totals", {})
+grid = d.get("matrix_grid", [])
+has_new = d.get("has_new_failures", False)
+failed_ci_jobs = d.get("failed_ci_jobs", [])
+untracked_failed = d.get("untracked_failed_ci_jobs", [])
+workflow_jobs = d.get("workflow_jobs", [])
+
+total_jobs = jobs.get("total", 0)
+failed_jobs = jobs.get("failed", 0)
+flaky_jobs = jobs.get("flaky", 0)
+passed_jobs = jobs.get("passed", 0)
+
+total_ci_jobs = len(workflow_jobs)
+failed_ci_count = len(failed_ci_jobs)
+passed_ci_count = sum(1 for j in workflow_jobs if j["conclusion"] == "success")
+
+status_icons = {
+ "passed": ":white_check_mark:",
+ "failed-new": ":rotating_light:",
+ "failed-recurring": ":x:",
+ "flaky": ":warning:",
+ "no-results": ":grey_question:",
+}
+
+def make_payload(blocks):
+ return json.dumps({
+ "username": "cuOpt Nightly Bot",
+ "icon_emoji": ":robot_face:",
+ "blocks": blocks,
+ })
+
+
+# ══════════════════════════════════════════════════════════════════════
+# MAIN MESSAGE (line 1) — posted to channel, becomes thread parent
+# ══════════════════════════════════════════════════════════════════════
+blocks = []
+
+# Identify which workflows have failures (from both CI jobs and matrix grid)
+failing_workflows = set()
+for j in failed_ci_jobs:
+ prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"]
+ failing_workflows.add(prefix)
+for g in grid:
+ if g["status"].startswith("failed"):
+ failing_workflows.add(g["test_type"])
+flaky_workflows = set()
+for g in grid:
+ if g["status"] == "flaky":
+ flaky_workflows.add(g["test_type"])
+
+has_failures = len(failing_workflows) > 0
+untracked_count = len(untracked_failed)
+
+if has_failures and (has_new or untracked_count > 0):
+ emoji = ":rotating_light:"
+ text = f"{len(failing_workflows)} workflow(s) with failures"
+ mention = ""
+elif has_failures:
+ emoji = ":x:"
+ text = f"Recurring failures in {len(failing_workflows)} workflow(s)"
+ mention = ""
+elif flaky_workflows:
+ emoji = ":large_yellow_circle:"
+ text = "All jobs passed but flaky tests detected"
+ mention = ""
+else:
+ emoji = ":white_check_mark:"
+ text = f"All {total_jobs} matrix jobs passed"
+ if total_ci_jobs > 0:
+ text += f", all {passed_ci_count} CI jobs succeeded"
+ mention = ""
+
+stats_parts = []
+if totals.get("failed", 0) > 0:
+ stats_parts.append(f":x: {totals['failed']} failed")
+if totals.get("flaky", 0) > 0:
+ stats_parts.append(f":warning: {totals['flaky']} flaky")
+if not stats_parts:
+ stats_parts.append(f":white_check_mark: {totals.get('total', 0)} tests passed")
+stats = " | ".join(stats_parts)
+
+blocks.append({
+ "type": "header",
+ "text": {
+ "type": "plain_text",
+ "text": f"cuOpt Nightly Tests \u2014 {branch} \u2014 {date}",
+ "emoji": True,
+ },
+})
+blocks.append({
+ "type": "section",
+ "text": {
+ "type": "mrkdwn",
+ "text": f"{mention}{emoji} *{text}*\n\n{stats}",
+ },
+})
+
+# Per-workflow failure summary using CI job counts from GitHub API
+# Build a lookup: workflow prefix -> (failed, total) from workflow_jobs
+wf_counts = {}
+for j in workflow_jobs:
+ prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"]
+ wf_counts.setdefault(prefix, {"failed": 0, "total": 0})
+ wf_counts[prefix]["total"] += 1
+ if j["conclusion"] == "failure":
+ wf_counts[prefix]["failed"] += 1
+
+if failing_workflows:
+ lines = []
+ for wf in sorted(failing_workflows):
+ counts = wf_counts.get(wf, {})
+ f_count = counts.get("failed", 0)
+ t_count = counts.get("total", 0)
+ if t_count > 0:
+ lines.append(f":x: *{wf}* — {f_count}/{t_count} failed")
+ else:
+ lines.append(f":x: *{wf}* — failed")
+ blocks.append({"type": "divider"})
+ blocks.append({
+ "type": "section",
+ "text": {"type": "mrkdwn", "text": "\n".join(lines)},
+ })
+
+# Links in main message
+link_parts = []
+if github_run_url:
+ link_parts.append(f"<{github_run_url}|:github: GitHub Actions>")
+if presigned_report_url:
+ link_parts.append(f"<{presigned_report_url}|:bar_chart: Full Report>")
+if presigned_dashboard_url:
+ link_parts.append(f"<{presigned_dashboard_url}|:chart_with_upwards_trend: Dashboard>")
+if link_parts:
+ blocks.append({"type": "divider"})
+ blocks.append({
+ "type": "context",
+ "elements": [{"type": "mrkdwn", "text": " | ".join(link_parts)}],
+ })
+
+print(make_payload(blocks))
+
+
+# ══════════════════════════════════════════════════════════════════════
+# THREAD REPLIES (lines 2+) — posted as replies to main message
+# ══════════════════════════════════════════════════════════════════════
+
+# ── Thread 1: Failing and flaky tests (grouped by workflow) ───────────
+# Build per-workflow test issue lists
+new_failures = d.get("new_failures", [])
+recurring = d.get("recurring_failures", [])
+flaky = d.get("flaky_tests", [])
+resolved = d.get("resolved_tests", [])
+
+# Collect all test issues by test_type (workflow)
+issues_by_wf = {}
+for f_entry in new_failures:
+ tt = f_entry.get("test_type", "unknown")
+ issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []})
+ issues_by_wf[tt]["new"].append(f_entry)
+for f_entry in recurring:
+ tt = f_entry.get("test_type", "unknown")
+ issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []})
+ issues_by_wf[tt]["recurring"].append(f_entry)
+for f_entry in flaky:
+ tt = f_entry.get("test_type", "unknown")
+ issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []})
+ issues_by_wf[tt]["flaky"].append(f_entry)
+for r in resolved:
+ tt = r.get("test_type", "unknown")
+ issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []})
+ issues_by_wf[tt]["resolved"].append(r)
+
+if issues_by_wf:
+ for wf_name, issues in sorted(issues_by_wf.items()):
+ wf_blocks = []
+ wf_text = f"*{wf_name}*\n"
+
+ # New failures
+ for f_entry in issues["new"][:10]:
+ msg = f_entry.get("message", "")[:60].replace("\n", " ")
+ matrix = f_entry.get("matrix_label", "")
+ wf_text += f":new: `{f_entry['name']}` ({matrix}) — {msg}\n"
+
+ # Recurring failures
+ for f_entry in issues["recurring"][:10]:
+ matrix = f_entry.get("matrix_label", "")
+ first = f_entry.get("first_seen", "?")
+ wf_text += f":repeat: `{f_entry['name']}` ({matrix}) — since {first}\n"
+
+ # Flaky
+ for f_entry in issues["flaky"][:10]:
+ matrix = f_entry.get("matrix_label", "")
+ wf_text += f":warning: `{f_entry['name']}` ({matrix})\n"
+
+ # Resolved
+ for r in issues["resolved"][:5]:
+ matrix = r.get("matrix_label", "")
+ count = r.get("failure_count", "?")
+ wf_text += f":white_check_mark: `{r['name']}` ({matrix}) — was failing {count}x\n"
+
+ # Truncation notes
+ for category, label, limit in [("new", "new failures", 10), ("recurring", "recurring", 10),
+ ("flaky", "flaky", 10), ("resolved", "resolved", 5)]:
+ if len(issues[category]) > limit:
+ wf_text += f"_...+{len(issues[category]) - limit} more {label}_\n"
+
+ # Chunk if needed
+ while wf_text:
+ chunk = wf_text[:2900]
+ wf_blocks.append({
+ "type": "section",
+ "text": {"type": "mrkdwn", "text": chunk.rstrip()},
+ })
+ wf_text = wf_text[2900:]
+
+ print(make_payload(wf_blocks))
+
+PYEOF
+)
+
+# ── Send messages ─────────────────────────────────────────────────────
+echo "Sending consolidated Slack notification..."
+
+THREAD_TS=""
+FIRST=true
+
+while IFS= read -r payload; do
+ if [ "${FIRST}" = true ] && [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then
+ # Post main message via chat.postMessage to get thread_ts
+ BOT_PAYLOAD=$(python3 -c "
+import json, sys
+p = json.loads(sys.argv[1])
+p['channel'] = sys.argv[2]
+print(json.dumps(p))
+" "${payload}" "${SLACK_CHANNEL_ID}")
+
+ RESPONSE=$(curl -s -X POST \
+ -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \
+ -H "Content-Type: application/json" \
+ --data "${BOT_PAYLOAD}" \
+ "https://slack.com/api/chat.postMessage")
+
+ THREAD_TS=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ts',''))" 2>/dev/null || echo "")
+ OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "")
+
+ if [ "${OK}" != "True" ]; then
+ echo "WARNING: chat.postMessage failed: ${RESPONSE}" >&2
+ # Fall back to webhook for this and remaining messages
+ THREAD_TS=""
+ curl -s -X POST -H 'Content-type: application/json' --data "${payload}" "${SLACK_WEBHOOK_URL}" || true
+ else
+ echo "Main message posted (ts=${THREAD_TS})"
+ fi
+ FIRST=false
+ elif [ -n "${THREAD_TS}" ] && [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then
+ # Post thread reply via chat.postMessage
+ THREAD_PAYLOAD=$(python3 -c "
+import json, sys
+p = json.loads(sys.argv[1])
+p['channel'] = sys.argv[2]
+p['thread_ts'] = sys.argv[3]
+print(json.dumps(p))
+" "${payload}" "${SLACK_CHANNEL_ID}" "${THREAD_TS}")
+
+ RESPONSE=$(curl -s -X POST \
+ -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \
+ -H "Content-Type: application/json" \
+ --data "${THREAD_PAYLOAD}" \
+ "https://slack.com/api/chat.postMessage")
+
+ OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "")
+ if [ "${OK}" != "True" ]; then
+ echo "WARNING: Thread reply failed: ${RESPONSE}" >&2
+ fi
+ else
+ # Fallback: webhook (no threading)
+ response=$(curl -s -X POST \
+ -H 'Content-type: application/json' \
+ --data "${payload}" \
+ "${SLACK_WEBHOOK_URL}")
+ if [ "${response}" != "ok" ]; then
+ echo "WARNING: Slack webhook returned: ${response}" >&2
+ fi
+ FIRST=false
+ fi
+done <<< "${PAYLOADS}"
+echo "Consolidated Slack notification sent."
+
+# ── Upload HTML report as file in thread ──────────────────────────────
+if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ] && [ -n "${CONSOLIDATED_HTML}" ] && [ -f "${CONSOLIDATED_HTML}" ]; then
+ echo "Uploading HTML report to Slack..."
+
+ REPORT_DATE=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('date','report'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "report")
+ REPORT_BRANCH=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('branch','main'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "main")
+ UPLOAD_FILENAME="cuopt-nightly-${REPORT_BRANCH}-${REPORT_DATE}.html"
+ FILE_SIZE=$(stat --format=%s "${CONSOLIDATED_HTML}")
+ UPLOAD_TITLE="cuOpt Nightly Report — ${REPORT_BRANCH} — ${REPORT_DATE}"
+
+ # Step 1: Get an upload URL from Slack
+ URL_RESPONSE=$(curl -s -X POST \
+ -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \
+ -H "Content-Type: application/x-www-form-urlencoded" \
+ --data-urlencode "filename=${UPLOAD_FILENAME}" \
+ --data-urlencode "length=${FILE_SIZE}" \
+ "https://slack.com/api/files.getUploadURLExternal")
+
+ UPLOAD_URL=$(echo "${URL_RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('upload_url',''))" 2>/dev/null)
+ FILE_ID=$(echo "${URL_RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('file_id',''))" 2>/dev/null)
+
+ if [ -z "${UPLOAD_URL}" ] || [ -z "${FILE_ID}" ]; then
+ echo "WARNING: Slack file upload failed at getUploadURLExternal. Response: ${URL_RESPONSE}" >&2
+ else
+ # Step 2: Upload the file content to the presigned URL
+ curl -s -X POST \
+ -F "file=@${CONSOLIDATED_HTML}" \
+ "${UPLOAD_URL}"
+
+ # Step 3: Complete the upload and share to channel (in thread if available)
+ COMPLETE_PAYLOAD=$(python3 -c "
+import json, sys
+payload = {
+ 'files': [{'id': sys.argv[1], 'title': sys.argv[2]}],
+ 'channel_id': sys.argv[3],
+ 'initial_comment': 'Full nightly test report \u2014 download and open in a browser for interactive details.',
+}
+thread_ts = sys.argv[4] if len(sys.argv) > 4 and sys.argv[4] else ''
+if thread_ts:
+ payload['thread_ts'] = thread_ts
+print(json.dumps(payload))
+" "${FILE_ID}" "${UPLOAD_TITLE}" "${SLACK_CHANNEL_ID}" "${THREAD_TS}")
+
+ COMPLETE_RESPONSE=$(curl -s -X POST \
+ -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \
+ -H "Content-Type: application/json" \
+ --data "${COMPLETE_PAYLOAD}" \
+ "https://slack.com/api/files.completeUploadExternal")
+
+ if echo "${COMPLETE_RESPONSE}" | python3 -c "import json,sys; sys.exit(0 if json.load(sys.stdin).get('ok') else 1)" 2>/dev/null; then
+ echo "HTML report uploaded to Slack."
+ else
+ echo "WARNING: Slack file upload failed at completeUploadExternal. Response: ${COMPLETE_RESPONSE}" >&2
+ fi
+ fi
+else
+ if [ -n "${SLACK_BOT_TOKEN}" ] && [ -z "${SLACK_CHANNEL_ID}" ]; then
+ echo "WARNING: SLACK_BOT_TOKEN set but SLACK_CHANNEL_ID missing, skipping file upload." >&2
+ fi
+fi
diff --git a/ci/utils/send_nightly_summary.sh b/ci/utils/send_nightly_summary.sh
new file mode 100755
index 0000000000..7b39a02cec
--- /dev/null
+++ b/ci/utils/send_nightly_summary.sh
@@ -0,0 +1,172 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Enhanced Slack notification for nightly test results.
+# Reads the JSON summary produced by nightly_report.py and sends a rich
+# Slack message with:
+# - Failure tables with :new: / :repeat: badges
+# - :rotating_light: status escalation on new genuine failures (no @channel mention is sent)
+# - Stabilized tests (were failing, now passing)
+# - Flaky test list
+#
+# Required environment variables:
+# SLACK_WEBHOOK_URL - Slack incoming webhook URL (set from CUOPT_SLACK_WEBHOOK_URL in CI)
+# NIGHTLY_SUMMARY - Path to nightly_summary.json from nightly_report.py
+#
+# Optional environment variables:
+# GITHUB_RUN_URL - Link to the GitHub Actions run
+# REPORT_URL - Link to the S3 HTML report
+# CUOPT_BRANCH - Branch name (e.g. main)
+
+set -euo pipefail
+
+NIGHTLY_SUMMARY="${NIGHTLY_SUMMARY:?NIGHTLY_SUMMARY must point to nightly_summary.json}"
+SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:?SLACK_WEBHOOK_URL is required}"
+GITHUB_RUN_URL="${GITHUB_RUN_URL:-}"
+REPORT_URL="${REPORT_URL:-}"
+CUOPT_BRANCH="${CUOPT_BRANCH:-main}"
+
+if [ ! -f "${NIGHTLY_SUMMARY}" ]; then
+ echo "ERROR: Summary file not found: ${NIGHTLY_SUMMARY}" >&2
+ exit 1
+fi
+
+# Build the entire Slack payload in Python for safe JSON handling.
+# Shell variable interpolation into nested JSON is brittle; Python reads the
+# summary file directly and produces a valid JSON payload on stdout.
+PAYLOAD=$(python3 - "${NIGHTLY_SUMMARY}" "${CUOPT_BRANCH}" "${GITHUB_RUN_URL}" "${REPORT_URL}" <<'PYEOF'
+import json, sys
+
+summary_path, branch, github_run_url, report_url = sys.argv[1:5]
+
+with open(summary_path) as f:
+ d = json.load(f)
+
+counts = d["counts"]
+total = counts["total"]
+passed = counts["passed"]
+failed = counts["failed"]
+flaky = counts["flaky"]
+skipped = counts["skipped"]
+resolved = counts.get("resolved", 0)
+has_new = d["has_new_failures"]
+
+# --- Status line ---
+if failed > 0:
+ if has_new:
+ emoji = ":rotating_light:"
+ text = "NEW test failures detected"
+ mention = " "
+ else:
+ emoji = ":x:"
+ text = "Recurring test failures"
+ mention = ""
+elif flaky > 0:
+ emoji = ":large_yellow_circle:"
+ text = "All passed but flaky tests detected"
+ mention = ""
+else:
+ emoji = ":white_check_mark:"
+ text = "All tests passed"
+ mention = ""
+
+stats = (
+ f":white_check_mark: {passed} passed | :x: {failed} failed | "
+ f":warning: {flaky} flaky | :fast_forward: {skipped} skipped | Total: {total}"
+)
+
+blocks = []
+
+# Header
+blocks.append({
+ "type": "header",
+ "text": {"type": "plain_text", "text": f"cuOpt Nightly Tests \u2014 {branch}", "emoji": True},
+})
+
+# Status summary
+blocks.append({
+ "type": "section",
+ "text": {"type": "mrkdwn", "text": f"{mention}{emoji} *{text}*\n\n{stats}"},
+})
+
+blocks.append({"type": "divider"})
+
+# --- Genuine failures ---
+if failed > 0:
+ lines = []
+ for f_entry in d.get("new_failures", []):
+ msg = f_entry.get("message", "")[:60].replace("\n", " ")
+ lines.append(f" :new: `{f_entry['name']}` ({f_entry['suite']}) \u2014 {msg}")
+ for f_entry in d.get("recurring_failures", []):
+ msg = f_entry.get("message", "")[:60].replace("\n", " ")
+ first = f_entry.get("first_seen", "?")
+ lines.append(f" :repeat: `{f_entry['name']}` ({f_entry['suite']}) \u2014 since {first}")
+ blocks.append({
+ "type": "section",
+ "text": {"type": "mrkdwn", "text": "*Genuine Failures:*\n" + "\n".join(lines)},
+ })
+
+# --- Stabilized tests ---
+resolved_list = d.get("resolved_tests", [])
+if resolved_list:
+ lines = []
+ for r in resolved_list:
+ since = r.get("first_seen", "?")
+ count = r.get("failure_count", "?")
+ flaky_tag = " (was flaky)" if r.get("was_flaky") else ""
+ lines.append(
+ f" :white_check_mark: `{r['name']}` ({r['suite']}) \u2014 "
+ f"failing since {since}, failed {count}x{flaky_tag}"
+ )
+ blocks.append({
+ "type": "section",
+ "text": {
+ "type": "mrkdwn",
+ "text": "*Stabilized (were failing, now pass):*\n" + "\n".join(lines),
+ },
+ })
+
+# --- Flaky tests ---
+flaky_list = d.get("flaky_tests", [])
+if flaky_list:
+ lines = []
+ for f_entry in flaky_list:
+ retries = f_entry.get("retry_count", "?")
+ lines.append(f" :warning: `{f_entry['name']}` ({f_entry['suite']}) \u2014 {retries} retries")
+ blocks.append({
+ "type": "section",
+ "text": {"type": "mrkdwn", "text": "*Flaky Tests (passed on retry):*\n" + "\n".join(lines)},
+ })
+
+# --- Links ---
+link_parts = []
+if github_run_url:
+ link_parts.append(f"<{github_run_url}|GitHub Actions>")
+if report_url:
+ link_parts.append(f"<{report_url}|Full Report>")
+if link_parts:
+ blocks.append({"type": "divider"})
+ blocks.append({
+ "type": "context",
+ "elements": [{"type": "mrkdwn", "text": " ".join(link_parts)}],
+ })
+
+payload = {
+ "channel": "cuopt-regression-testing",
+ "username": "cuOpt Nightly Bot",
+ "icon_emoji": ":robot_face:",
+ "blocks": blocks,
+}
+print(json.dumps(payload))
+PYEOF
+)
+
+echo "Sending Slack notification..."
+curl -s -X POST \
+ -H 'Content-type: application/json' \
+ --data "${PAYLOAD}" \
+ "${SLACK_WEBHOOK_URL}"
+
+echo ""
+echo "Slack notification sent."
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 04dc6bb83c..e8000ffbb3 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -58,6 +58,7 @@ dependencies:
- pylibraft==26.6.*,>=0.0.0a0
- pyrsistent
- pytest-cov
+- pytest-rerunfailures
- pytest<9.0
- python>=3.11,<3.15
- pyyaml>=6.0.0
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index 21891cc9f2..43bc8996ad 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -58,6 +58,7 @@ dependencies:
- pylibraft==26.6.*,>=0.0.0a0
- pyrsistent
- pytest-cov
+- pytest-rerunfailures
- pytest<9.0
- python>=3.11,<3.15
- pyyaml>=6.0.0
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index 89147b18a7..5a53e13d37 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -58,6 +58,7 @@ dependencies:
- pylibraft==26.6.*,>=0.0.0a0
- pyrsistent
- pytest-cov
+- pytest-rerunfailures
- pytest<9.0
- python>=3.11,<3.15
- pyyaml>=6.0.0
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index 8df6f28bf7..2efc26c0cb 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -58,6 +58,7 @@ dependencies:
- pylibraft==26.6.*,>=0.0.0a0
- pyrsistent
- pytest-cov
+- pytest-rerunfailures
- pytest<9.0
- python>=3.11,<3.15
- pyyaml>=6.0.0
diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh
index 528455e133..472813a003 100755
--- a/datasets/get_test_data.sh
+++ b/datasets/get_test_data.sh
@@ -8,7 +8,7 @@ set -o pipefail
################################################################################
# S3 Dataset Download Support
################################################################################
-# Set CUOPT_DATASET_S3_URI to base S3 path
+# Set CUOPT_S3_URI to S3 bucket root (e.g., s3://cuopt-datasets/)
# AWS credentials should be configured via:
# - Environment variables (CUOPT_AWS_ACCESS_KEY_ID, CUOPT_AWS_SECRET_ACCESS_KEY)
# - Standard AWS variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
@@ -18,8 +18,8 @@ set -o pipefail
function try_download_from_s3() {
local s3_dirs=("$@") # Array of directories to sync from S3
- if [ -z "${CUOPT_DATASET_S3_URI:-}" ]; then
- echo "CUOPT_DATASET_S3_URI not set, skipping S3 download..."
+ if [ -z "${CUOPT_S3_URI:-}" ]; then
+ echo "CUOPT_S3_URI not set, skipping S3 download..."
return 1
fi
@@ -35,7 +35,7 @@ function try_download_from_s3() {
fi
# Append routing subdirectory to base S3 URI
- local s3_uri="${CUOPT_DATASET_S3_URI}routing/"
+ local s3_uri="${CUOPT_S3_URI}ci_datasets/routing/"
echo "Downloading datasets from S3..."
# Use CUOPT-specific credentials only
diff --git a/dependencies.yaml b/dependencies.yaml
index 057fc2a318..18d479a99f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -340,6 +340,7 @@ dependencies:
packages:
- pytest<9.0
- pytest-cov
+ - pytest-rerunfailures
test_python_cuopt:
common:
- output_types: [conda]
diff --git a/python/cuopt/cuopt/linear_programming/pyproject.toml b/python/cuopt/cuopt/linear_programming/pyproject.toml
index 934b12f547..6e2c59c43c 100644
--- a/python/cuopt/cuopt/linear_programming/pyproject.toml
+++ b/python/cuopt/cuopt/linear_programming/pyproject.toml
@@ -37,6 +37,7 @@ Source = "https://github.com/nvidia/cuopt"
[project.optional-dependencies]
test = [
"pytest-cov",
+ "pytest-rerunfailures",
"pytest<9.0",
"rapids-logger==0.2.*,>=0.0.0a0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml
index eff7e01769..18b6e75276 100644
--- a/python/cuopt/pyproject.toml
+++ b/python/cuopt/pyproject.toml
@@ -47,6 +47,7 @@ classifiers = [
test = [
"numpy>=1.23.5,<3.0",
"pytest-cov",
+ "pytest-rerunfailures",
"pytest<9.0",
"rapids-logger==0.2.*,>=0.0.0a0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cuopt_self_hosted/pyproject.toml b/python/cuopt_self_hosted/pyproject.toml
index 43aa80a5b3..f4a3b75a60 100644
--- a/python/cuopt_self_hosted/pyproject.toml
+++ b/python/cuopt_self_hosted/pyproject.toml
@@ -37,6 +37,7 @@ classifiers = [
[project.optional-dependencies]
test = [
"pytest-cov",
+ "pytest-rerunfailures",
"pytest<9.0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml
index ce96c884be..4f9f141011 100644
--- a/python/cuopt_server/pyproject.toml
+++ b/python/cuopt_server/pyproject.toml
@@ -48,6 +48,7 @@ test = [
"msgpack==1.1.2",
"pexpect",
"pytest-cov",
+ "pytest-rerunfailures",
"pytest<9.0",
"requests",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/skills/cuopt-developer/SKILL.md b/skills/cuopt-developer/SKILL.md
index 99743f9171..66d41c003e 100644
--- a/skills/cuopt-developer/SKILL.md
+++ b/skills/cuopt-developer/SKILL.md
@@ -295,6 +295,16 @@ rmm::device_uvector data(100, stream);
| Missing `nvcc` | Set `$CUDACXX` or add CUDA to `$PATH` |
| CUDA out of memory | Reduce problem size |
| Slow debug library loading | Device symbols cause delay |
+| CI state doesn't persist between runs | CI containers are ephemeral. Never write persistent state to repo files from CI — use S3 (`CUOPT_S3_URI`) or artifact stores. Ask: "After this container dies, does tomorrow's run see today's data?" |
+| CI state transitions go unreported | When CI tracks state over time (e.g. test failures), every transition (new failure, recurring, stabilized) needs an explicit notification path. Ask: "When state X changes to Y, who learns about it and how?" |
+| Designing CI features without lifecycle check | Before shipping any CI feature that tracks state: (1) Where does state live between runs? (2) What writes/reads it? (3) What happens on state transitions? Verify end-to-end, not just the happy-path logic. |
+| Change applied to only some targets | Before implementing, audit the full scope of what needs the change. For CI: `ls ci/test*.sh`. For APIs: grep all callers. For patterns: find every instance. Enumerate ALL targets first, implement second. |
+| Shared resource ignores CI matrix parallelism | CI matrices run jobs in parallel across CUDA x Python x arch. Any shared resource (S3 paths, files, databases) must be keyed by the full execution context. Ask: "What happens when N parallel jobs access this simultaneously?" |
+| Same logic duplicated across files | When the same block (>10 lines) appears in 2+ places — any language, any context — extract a shared helper immediately. Don't duplicate first and refactor later. This applies to shell scripts, Python modules, C/C++ code equally. |
+| Feature not extensible for new variants | After implementing, ask: "If someone adds a new variant (test type, matrix entry, endpoint, etc.), what do they change?" If the answer is more than a one-line addition, the design needs a shared helper or auto-discovery. Avoid hardcoded lists of known variants. |
+| Reports generated without actionable detail | Reports and notifications must include enough context to act without digging: error messages, execution context (matrix, commit), history (new vs recurring), and links or attachments for full details. Provide downloadable artifacts when possible. |
+
+
## Canonical Documentation