From 42151cf3685a4fb0b911aa7bf63ed0303330229e Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 11:45:55 -0500 Subject: [PATCH 01/60] Add flaky test retry support for gtests and pytests Add retry logic for gtest binaries via GTEST_MAX_RETRIES (default 1) and pytest reruns via --reruns 2 --reruns-delay 5. Tests that fail then pass on retry are classified as flaky rather than failures. Add pytest-rerunfailures as a test dependency. --- ci/run_ctests.sh | 30 +++++++++++++++++++++++++++--- ci/run_cuopt_pytests.sh | 2 +- ci/run_cuopt_server_pytests.sh | 2 +- dependencies.yaml | 1 + 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh index fc1de8e1b4..f1d57519b1 100755 --- a/ci/run_ctests.sh +++ b/ci/run_ctests.sh @@ -21,16 +21,40 @@ else exit 1 fi -for gt in "${GTEST_DIR}"/*_TEST; do +GTEST_MAX_RETRIES=${GTEST_MAX_RETRIES:-1} + +run_gtest_with_retry() { + local gt="$1" + shift + local test_name test_name=$(basename "${gt}") + echo "Running gtest ${test_name}" - "${gt}" "$@" + if "${gt}" "$@"; then + return 0 + fi + + local attempt + for attempt in $(seq 1 "${GTEST_MAX_RETRIES}"); do + echo "WARNING: ${test_name} failed, retry ${attempt}/${GTEST_MAX_RETRIES}" + if "${gt}" "$@"; then + echo "FLAKY: ${test_name} passed on retry ${attempt}" + return 0 + fi + done + + echo "FAILED: ${test_name} failed after $((GTEST_MAX_RETRIES + 1)) attempts" + return 1 +} + +for gt in "${GTEST_DIR}"/*_TEST; do + run_gtest_with_retry "${gt}" "$@" done # Run C_API_TEST with CPU memory for local solves (excluding time limit tests) if [ -x "${GTEST_DIR}/C_API_TEST" ]; then echo "Running gtest C_API_TEST with CUOPT_USE_CPU_MEM_FOR_LOCAL" - CUOPT_USE_CPU_MEM_FOR_LOCAL=1 "${GTEST_DIR}/C_API_TEST" --gtest_filter=-c_api/TimeLimitTestFixture.* "$@" + CUOPT_USE_CPU_MEM_FOR_LOCAL=1 run_gtest_with_retry "${GTEST_DIR}/C_API_TEST" --gtest_filter=-c_api/TimeLimitTestFixture.* "$@" else echo "Skipping C_API_TEST with 
CUOPT_USE_CPU_MEM_FOR_LOCAL (binary not found)" fi diff --git a/ci/run_cuopt_pytests.sh b/ci/run_cuopt_pytests.sh index 66e996715a..080fa42a1b 100755 --- a/ci/run_cuopt_pytests.sh +++ b/ci/run_cuopt_pytests.sh @@ -9,4 +9,4 @@ set -euo pipefail # Support invoking run_cuopt_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuopt/cuopt/ -pytest -s --cache-clear "$@" tests +pytest -s --cache-clear --reruns 2 --reruns-delay 5 "$@" tests diff --git a/ci/run_cuopt_server_pytests.sh b/ci/run_cuopt_server_pytests.sh index 4cb361a473..75d87d255d 100755 --- a/ci/run_cuopt_server_pytests.sh +++ b/ci/run_cuopt_server_pytests.sh @@ -9,4 +9,4 @@ set -euo pipefail # Support invoking run_cuopt_server_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuopt_server/cuopt_server/ -pytest -s --cache-clear "$@" tests +pytest -s --cache-clear --reruns 2 --reruns-delay 5 "$@" tests diff --git a/dependencies.yaml b/dependencies.yaml index 057fc2a318..18d479a99f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -340,6 +340,7 @@ dependencies: packages: - pytest<9.0 - pytest-cov + - pytest-rerunfailures test_python_cuopt: common: - output_types: [conda] From 1fc5cef01faeb128d70e01dc7459e8913804b6d0 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 11:46:03 -0500 Subject: [PATCH 02/60] Add nightly report generator and shared helpers Add matrix-aware nightly test report generator that parses JUnit XML, classifies failures as new/recurring/flaky/stabilized, maintains per-matrix failure history on S3, and outputs Markdown, HTML, and JSON reports. Extract S3 helpers into shared module and shell helper to eliminate duplication across test scripts. 
--- ci/utils/nightly_report.py | 849 ++++++++++++++++++++++++++++++ ci/utils/nightly_report_helper.sh | 93 ++++ ci/utils/s3_helpers.py | 87 +++ 3 files changed, 1029 insertions(+) create mode 100755 ci/utils/nightly_report.py create mode 100755 ci/utils/nightly_report_helper.sh create mode 100644 ci/utils/s3_helpers.py diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py new file mode 100755 index 0000000000..40e2e65798 --- /dev/null +++ b/ci/utils/nightly_report.py @@ -0,0 +1,849 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Nightly test report generator for cuOpt CI. + +Parses JUnit XML test results, classifies failures as flaky vs genuine, +maintains a failure history database on S3, and outputs: + - HTML report (detailed, uploaded to S3 and linked from Slack) + - Markdown summary (for $GITHUB_STEP_SUMMARY or terminal) + - JSON summary (for downstream consumers like Slack notifier and dashboard) + +Each CI matrix job (CUDA version x Python version x architecture) runs this +script independently. The --test-type and --matrix-label flags identify the +job so that history and summaries are stored per-matrix-combo. + +History lifecycle: + 1. Download history from S3 (falls back to empty if not found) + 2. Classify this run's results + 3. Update history: mark new failures, bump recurring counts, resolve stabilized tests + 4. Upload updated history back to S3 + 5. Generate reports (HTML, Markdown, JSON, GitHub Step Summary) + 6. 
Upload per-run JSON snapshot to S3 summaries dir (for aggregation) + +Usage: + python ci/utils/nightly_report.py \\ + --results-dir test-results/ \\ + --output-dir report-output/ \\ + --sha abc123 \\ + --test-type python \\ + --matrix-label cuda12.9-py3.12-x86_64 \\ + --s3-history-uri s3://bucket/ci_test_reports/nightly/history/python-main-cuda12.9-py3.12-x86_64.json \\ + --s3-summary-uri s3://bucket/ci_test_reports/nightly/summaries/2026-04-13/python-cuda12.9-py3.12-x86_64.json +""" + +import argparse +import json +import os +import sys +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path +from xml.etree import ElementTree + +# Ensure ci/utils is importable when invoked as a script +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from s3_helpers import s3_download, s3_upload # noqa: E402 + +EMPTY_HISTORY = {"_schema_version": 1, "tests": {}} + + +# --------------------------------------------------------------------------- +# JUnit XML parsing +# --------------------------------------------------------------------------- + +def parse_junit_xml(xml_path): + """Parse a JUnit XML file and return a list of test result dicts.""" + results = [] + try: + tree = ElementTree.parse(xml_path) + except ElementTree.ParseError as e: + print(f"WARNING: Failed to parse {xml_path}: {e}", file=sys.stderr) + return results + + root = tree.getroot() + + if root.tag == "testsuites": + suites = root.findall("testsuite") + elif root.tag == "testsuite": + suites = [root] + else: + return results + + for suite in suites: + suite_name = suite.get("name", os.path.basename(xml_path)) + for testcase in suite.findall("testcase"): + name = testcase.get("name", "unknown") + classname = testcase.get("classname", "") + time_taken = testcase.get("time", "0") + + failure = testcase.find("failure") + error = testcase.find("error") + skipped = testcase.find("skipped") + + if skipped is not None: + status = "skipped" + message = 
skipped.get("message", "") + elif failure is not None: + status = "failed" + message = failure.get("message", "") + if failure.text: + message = failure.text[:500] + elif error is not None: + status = "error" + message = error.get("message", "") + if error.text: + message = error.text[:500] + else: + status = "passed" + message = "" + + results.append({ + "suite": suite_name, + "classname": classname, + "name": name, + "status": status, + "time": time_taken, + "message": message, + "source_file": str(xml_path), + }) + + return results + + +def collect_all_results(results_dir): + """Collect test results from all JUnit XML files in a directory.""" + results_dir = Path(results_dir) + all_results = [] + for xml_file in sorted(results_dir.rglob("*.xml")): + all_results.extend(parse_junit_xml(xml_file)) + return all_results + + +# --------------------------------------------------------------------------- +# Classification +# --------------------------------------------------------------------------- + +def classify_failures(results): + """ + Classify test results into passed, failed, flaky, skipped, and error. + + pytest-rerunfailures records reruns as additional entries. + A test that failed then passed on rerun is flaky. 
+ """ + test_groups = defaultdict(list) + for r in results: + key = f"{r['suite']}::{r['classname']}::{r['name']}" + test_groups[key].append(r) + + classified = { + "passed": [], + "failed": [], + "flaky": [], + "skipped": [], + "error": [], + } + + for key, entries in test_groups.items(): + statuses = [e["status"] for e in entries] + + if all(s == "skipped" for s in statuses): + classified["skipped"].append(entries[0]) + elif any(s == "passed" for s in statuses): + if any(s in ("failed", "error") for s in statuses): + entry = entries[-1].copy() + entry["status"] = "flaky" + entry["retry_count"] = sum( + 1 for s in statuses if s in ("failed", "error") + ) + classified["flaky"].append(entry) + else: + classified["passed"].append(entries[-1]) + elif any(s == "error" for s in statuses): + classified["error"].append(entries[-1]) + else: + classified["failed"].append(entries[-1]) + + return classified + + +# --------------------------------------------------------------------------- +# History management +# --------------------------------------------------------------------------- + +def load_history(history_path): + """Load failure history from a local JSON file.""" + try: + with open(history_path) as f: + data = json.load(f) + if "tests" in data: + return data + except (FileNotFoundError, json.JSONDecodeError): + pass + return dict(EMPTY_HISTORY) + + +def update_history(history, classified, sha, date_str): + """ + Update failure history with this run's results. + + Returns (history, new_failures, recurring_failures, resolved_tests). + resolved_tests = previously active failures that passed this run (stabilized). 
+ """ + tests = history.setdefault("tests", {}) + new_failures = [] + recurring_failures = [] + resolved_tests = [] + + # --- Genuine failures --- + for entry in classified["failed"] + classified["error"]: + test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" + + if test_key in tests and tests[test_key]["status"] == "active": + tests[test_key]["last_seen_date"] = date_str + tests[test_key]["last_seen_sha"] = sha + tests[test_key]["failure_count"] += 1 + recurring_failures.append( + {**entry, "first_seen": tests[test_key]["first_seen_date"]} + ) + else: + tests[test_key] = { + "suite": entry["suite"], + "classname": entry["classname"], + "name": entry["name"], + "first_seen_date": date_str, + "first_seen_sha": sha, + "last_seen_date": date_str, + "last_seen_sha": sha, + "failure_count": 1, + "is_flaky": False, + "status": "active", + } + new_failures.append(entry) + + # --- Flaky tests --- + for entry in classified["flaky"]: + test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" + if test_key in tests: + tests[test_key]["last_seen_date"] = date_str + tests[test_key]["last_seen_sha"] = sha + tests[test_key]["failure_count"] += 1 + tests[test_key]["is_flaky"] = True + else: + tests[test_key] = { + "suite": entry["suite"], + "classname": entry["classname"], + "name": entry["name"], + "first_seen_date": date_str, + "first_seen_sha": sha, + "last_seen_date": date_str, + "last_seen_sha": sha, + "failure_count": 1, + "is_flaky": True, + "status": "active", + } + + # --- Resolve stabilized tests --- + passed_keys = set() + for entry in classified["passed"]: + test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" + passed_keys.add(test_key) + + for test_key in passed_keys: + if test_key in tests and tests[test_key]["status"] == "active": + rec = tests[test_key] + rec["status"] = "resolved" + rec["resolved_date"] = date_str + rec["resolved_sha"] = sha + resolved_tests.append({ + "suite": rec["suite"], + "classname": 
rec["classname"], + "name": rec["name"], + "first_seen": rec["first_seen_date"], + "failure_count": rec["failure_count"], + "was_flaky": rec.get("is_flaky", False), + }) + + return history, new_failures, recurring_failures, resolved_tests + + +def save_history(history, history_path): + """Write history to a local JSON file.""" + with open(history_path, "w") as f: + json.dump(history, f, indent=2, sort_keys=True) + f.write("\n") + + +# --------------------------------------------------------------------------- +# Report generation +# --------------------------------------------------------------------------- + +def generate_markdown_report( + classified, new_failures, recurring_failures, resolved_tests, history, + test_type="", matrix_label="", sha="", date_str="", +): + """Generate a Markdown summary report.""" + lines = [] + title = "# Nightly Test Report" + if test_type: + title += f" — {test_type}" + if matrix_label: + title += f" [{matrix_label}]" + lines.append(title) + lines.append("") + if date_str or sha: + meta_parts = [] + if date_str: + meta_parts.append(f"**Date:** {date_str}") + if sha: + meta_parts.append(f"**Commit:** `{sha[:12]}`") + if matrix_label: + meta_parts.append(f"**Matrix:** {matrix_label}") + lines.append(" | ".join(meta_parts)) + lines.append("") + + total_passed = len(classified["passed"]) + total_failed = len(classified["failed"]) + len(classified["error"]) + total_flaky = len(classified["flaky"]) + total_skipped = len(classified["skipped"]) + total = total_passed + total_failed + total_flaky + total_skipped + + lines.append("## Summary") + lines.append("") + lines.append("| Metric | Count |") + lines.append("|--------|-------|") + lines.append(f"| Total tests | {total} |") + lines.append(f"| Passed | {total_passed} |") + lines.append(f"| **Genuine failures** | **{total_failed}** |") + lines.append(f"| Flaky (passed on retry) | {total_flaky} |") + lines.append(f"| Skipped | {total_skipped} |") + if resolved_tests: + lines.append(f"| 
**Stabilized (were failing, now pass)** | **{len(resolved_tests)}** |") + lines.append("") + + # -- New genuine failures (highest priority) -- + if new_failures: + lines.append("## NEW Failures (not previously seen)") + lines.append("") + lines.append("| Suite | Test | Error |") + lines.append("|-------|------|-------|") + for entry in new_failures: + short_msg = ( + entry.get("message", "")[:80].replace("\n", " ").replace("|", "\\|") + ) + lines.append(f"| {entry['suite']} | `{entry['name']}` | {short_msg} |") + lines.append("") + + # -- Recurring failures -- + if recurring_failures: + lines.append("## Recurring Failures") + lines.append("") + lines.append("| Suite | Test | First seen | Failure count | Error |") + lines.append("|-------|------|------------|---------------|-------|") + for entry in recurring_failures: + short_msg = ( + entry.get("message", "")[:60].replace("\n", " ").replace("|", "\\|") + ) + first_seen = entry.get("first_seen", "unknown") + test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" + count = history.get("tests", {}).get(test_key, {}).get("failure_count", "?") + lines.append( + f"| {entry['suite']} | `{entry['name']}` | {first_seen} | {count} | {short_msg} |" + ) + lines.append("") + + # -- Stabilized tests -- + if resolved_tests: + lines.append("## Stabilized Tests (were failing, now passing)") + lines.append("") + lines.append("| Suite | Test | Was failing since | Total failure count | Was flaky? 
|") + lines.append("|-------|------|-------------------|---------------------|------------|") + for entry in resolved_tests: + flaky_badge = "Yes" if entry.get("was_flaky") else "No" + lines.append( + f"| {entry['suite']} | `{entry['name']}` | {entry['first_seen']} " + f"| {entry['failure_count']} | {flaky_badge} |" + ) + lines.append("") + + # -- Flaky tests -- + if classified["flaky"]: + lines.append("## Flaky Tests (passed on retry)") + lines.append("") + lines.append("| Suite | Test | Retries needed |") + lines.append("|-------|------|----------------|") + for entry in classified["flaky"]: + retry_count = entry.get("retry_count", "?") + lines.append(f"| {entry['suite']} | `{entry['name']}` | {retry_count} |") + lines.append("") + + # -- Detailed errors -- + all_failures = classified["failed"] + classified["error"] + if all_failures: + lines.append("## All Failure Details") + lines.append("") + for entry in all_failures: + lines.append(f"### `{entry['classname']}::{entry['name']}`") + lines.append(f"- **Suite**: {entry['suite']}") + lines.append(f"- **Source**: {entry['source_file']}") + msg = entry.get("message", "").strip() + if msg: + lines.append("- **Error**:") + lines.append("```") + for line in msg.split("\n")[:20]: + lines.append(line) + lines.append("```") + lines.append("") + + if not all_failures and not classified["flaky"] and not resolved_tests: + lines.append("All tests passed! 
No failures or flaky tests detected.") + lines.append("") + + return "\n".join(lines) + + +def generate_json_summary( + classified, new_failures, recurring_failures, resolved_tests, + test_type="", matrix_label="", sha="", date_str="", +): + """Generate a JSON summary for downstream tools (Slack notifier, dashboard).""" + return { + "timestamp": datetime.now(timezone.utc).isoformat(), + "test_type": test_type, + "matrix_label": matrix_label, + "sha": sha, + "date": date_str, + "counts": { + "total": sum(len(v) for v in classified.values()), + "passed": len(classified["passed"]), + "failed": len(classified["failed"]) + len(classified["error"]), + "flaky": len(classified["flaky"]), + "skipped": len(classified["skipped"]), + "resolved": len(resolved_tests), + }, + "has_new_failures": len(new_failures) > 0, + "new_failures": [ + { + "suite": e["suite"], + "name": e["name"], + "classname": e["classname"], + "message": e.get("message", "")[:200], + } + for e in new_failures + ], + "recurring_failures": [ + { + "suite": e["suite"], + "name": e["name"], + "classname": e["classname"], + "first_seen": e.get("first_seen", "unknown"), + "message": e.get("message", "")[:200], + } + for e in recurring_failures + ], + "flaky_tests": [ + { + "suite": e["suite"], + "name": e["name"], + "classname": e["classname"], + "retry_count": e.get("retry_count", 0), + } + for e in classified["flaky"] + ], + "resolved_tests": [ + { + "suite": e["suite"], + "name": e["name"], + "classname": e["classname"], + "first_seen": e.get("first_seen", "unknown"), + "failure_count": e.get("failure_count", 0), + "was_flaky": e.get("was_flaky", False), + } + for e in resolved_tests + ], + } + + +# --------------------------------------------------------------------------- +# HTML report +# --------------------------------------------------------------------------- + +def _html_escape(text): + """Escape HTML special characters.""" + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") 
+ .replace('"', """) + ) + + +def generate_html_report( + classified, new_failures, recurring_failures, resolved_tests, history, + test_type="", matrix_label="", sha="", date_str="", +): + """Generate a self-contained HTML report with detailed failure info.""" + total_passed = len(classified["passed"]) + total_failed = len(classified["failed"]) + len(classified["error"]) + total_flaky = len(classified["flaky"]) + total_skipped = len(classified["skipped"]) + total = total_passed + total_failed + total_flaky + total_skipped + + title = "Nightly Test Report" + if test_type: + title += f" — {_html_escape(test_type)}" + if matrix_label: + title += f" [{_html_escape(matrix_label)}]" + + # Determine overall status color + if total_failed > 0: + status_color = "#d32f2f" + status_text = f"{total_failed} failure(s)" + elif total_flaky > 0: + status_color = "#f9a825" + status_text = "All passed (flaky detected)" + else: + status_color = "#388e3c" + status_text = "All passed" + + parts = [] + parts.append(f""" + + + + +{title} + + + +

{title}

+
""") + + meta_parts = [] + if date_str: + meta_parts.append(f"Date: {_html_escape(date_str)}") + if sha: + meta_parts.append(f"Commit: {_html_escape(sha[:12])}") + if matrix_label: + meta_parts.append(f"Matrix: {_html_escape(matrix_label)}") + parts.append("  |  ".join(meta_parts)) + + parts.append(f"""
+
{status_text}
+
+
{total}
Total
+
{total_passed}
Passed
+
{total_failed}
Failed
+
{total_flaky}
Flaky
+
Skipped
+
{len(resolved_tests)}
Stabilized
+
""") + + # --- New failures --- + if new_failures: + parts.append('

New Failures

') + parts.append('') + for e in new_failures: + msg = _html_escape(e.get("message", "")) + short = _html_escape(e.get("message", "")[:100]) + parts.append( + f'' + f'' + f'' + ) + parts.append("
SuiteTestError
{_html_escape(e["suite"])}{_html_escape(e["name"])} ' + f'NEW
{short}' + f'
{msg}
") + + # --- Recurring failures --- + if recurring_failures: + parts.append('

Recurring Failures

') + parts.append( + "" + "" + ) + for e in recurring_failures: + msg = _html_escape(e.get("message", "")) + short = _html_escape(e.get("message", "")[:100]) + first_seen = _html_escape(e.get("first_seen", "unknown")) + test_key = f"{e['suite']}::{e['classname']}::{e['name']}" + count = history.get("tests", {}).get(test_key, {}).get( + "failure_count", "?" + ) + parts.append( + f'' + f'' + f"" + f'' + ) + parts.append("
SuiteTestFirst SeenCountError
{_html_escape(e["suite"])}{_html_escape(e["name"])} ' + f'RECURRING{first_seen}{count}
{short}' + f'
{msg}
") + + # --- Stabilized --- + if resolved_tests: + parts.append('

Stabilized Tests

') + parts.append( + "" + "" + ) + for e in resolved_tests: + flaky_tag = "Yes" if e.get("was_flaky") else "No" + parts.append( + f'' + f'' + f'' + f'' + f"" + ) + parts.append("
SuiteTestFailing SinceFailure CountWas Flaky?
{_html_escape(e["suite"])}{_html_escape(e["name"])} ' + f'FIXED{_html_escape(e.get("first_seen", "?"))}{e.get("failure_count", "?")}{flaky_tag}
") + + # --- Flaky --- + if classified["flaky"]: + parts.append('

Flaky Tests (passed on retry)

') + parts.append("") + for e in classified["flaky"]: + parts.append( + f'' + f'' + f'' + ) + parts.append("
SuiteTestRetries
{_html_escape(e["suite"])}{_html_escape(e["name"])} ' + f'FLAKY{e.get("retry_count", "?")}
") + + # --- All failure details --- + all_failures = classified["failed"] + classified["error"] + if all_failures: + parts.append("

All Failure Details

") + for e in all_failures: + msg = _html_escape(e.get("message", "").strip()) + parts.append( + f'

' + f'{_html_escape(e["classname"])}::{_html_escape(e["name"])}

' + f'

' + f'Suite: {_html_escape(e["suite"])}  |  ' + f'Source: {_html_escape(e["source_file"])}

' + ) + if msg: + parts.append(f'
{msg}
') + parts.append("
") + + if not all_failures and not classified["flaky"] and not resolved_tests: + parts.append('

All tests passed! No failures or flaky tests detected.

') + + parts.append("") + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description="Generate nightly test failure report from JUnit XML results" + ) + parser.add_argument( + "--results-dir", required=True, + help="Directory containing JUnit XML test result files", + ) + parser.add_argument( + "--output-dir", default="report-output", + help="Directory to write report files to", + ) + parser.add_argument( + "--sha", default=os.environ.get("GITHUB_SHA", "unknown"), + help="Git commit SHA for this run", + ) + parser.add_argument( + "--date", default=datetime.now(timezone.utc).strftime("%Y-%m-%d"), + help="Date for this run (YYYY-MM-DD)", + ) + parser.add_argument( + "--test-type", default="unknown", + help=( + "Test type identifier (e.g., cpp, python, wheel-python, " + "wheel-server, notebooks)" + ), + ) + parser.add_argument( + "--matrix-label", default="", + help=( + "Matrix combination label (e.g., cuda12.9-py3.12-x86_64). " + "Included in reports and JSON summary to identify the CI job." + ), + ) + parser.add_argument( + "--s3-history-uri", default="", + help=( + "S3 URI for persistent failure history JSON. " + "Downloaded before analysis, uploaded after update. " + "Example: s3://bucket/ci_test_reports/nightly/history/" + "python-main-cuda12.9-py3.12-x86_64.json" + ), + ) + parser.add_argument( + "--s3-summary-uri", default="", + help=( + "S3 URI to upload this run's JSON snapshot for aggregation. " + "Example: s3://bucket/ci_test_reports/nightly/summaries/" + "2026-04-13/python-cuda12.9-py3.12-x86_64.json" + ), + ) + parser.add_argument( + "--s3-html-uri", default="", + help=( + "S3 URI to upload the HTML report. 
" + "Example: s3://bucket/ci_test_reports/nightly/reports/" + "2026-04-13/python-cuda12.9-py3.12-x86_64.html" + ), + ) + parser.add_argument( + "--github-step-summary", + default=os.environ.get("GITHUB_STEP_SUMMARY", ""), + help="Path to write GitHub Actions step summary", + ) + + args = parser.parse_args() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + local_history_path = str(output_dir / "test_failure_history.json") + + # ---- Step 1: Download history from S3 ---- + if args.s3_history_uri: + s3_download(args.s3_history_uri, local_history_path) + + # ---- Step 2: Collect and classify results ---- + print(f"Collecting test results from {args.results_dir} ...") + results = collect_all_results(args.results_dir) + if not results: + print("WARNING: No test results found.", file=sys.stderr) + + print(f"Found {len(results)} test case entries across all XML files") + classified = classify_failures(results) + + print( + f"Classification: {len(classified['passed'])} passed, " + f"{len(classified['failed'])} failed, " + f"{len(classified['error'])} errors, " + f"{len(classified['flaky'])} flaky, " + f"{len(classified['skipped'])} skipped" + ) + + # ---- Step 3: Update history ---- + history = load_history(local_history_path) + history, new_failures, recurring_failures, resolved_tests = update_history( + history, classified, args.sha, args.date + ) + + if resolved_tests: + print(f"Stabilized: {len(resolved_tests)} previously-failing test(s) now pass") + + save_history(history, local_history_path) + print(f"Updated local history at {local_history_path}") + + # ---- Step 4: Upload history back to S3 ---- + if args.s3_history_uri: + s3_upload(local_history_path, args.s3_history_uri) + + # ---- Step 5: Generate reports ---- + report_kwargs = dict( + test_type=args.test_type, + matrix_label=args.matrix_label, + sha=args.sha, + date_str=args.date, + ) + + md_report = generate_markdown_report( + classified, new_failures, recurring_failures, 
resolved_tests, history, + **report_kwargs, + ) + md_path = output_dir / "nightly_report.md" + md_path.write_text(md_report) + print(f"Markdown report written to {md_path}") + + html_report = generate_html_report( + classified, new_failures, recurring_failures, resolved_tests, history, + **report_kwargs, + ) + html_path = output_dir / "nightly_report.html" + html_path.write_text(html_report) + print(f"HTML report written to {html_path}") + + json_summary = generate_json_summary( + classified, new_failures, recurring_failures, resolved_tests, + **report_kwargs, + ) + json_path = output_dir / "nightly_summary.json" + json_path.write_text(json.dumps(json_summary, indent=2) + "\n") + print(f"JSON summary written to {json_path}") + + if args.github_step_summary: + with open(args.github_step_summary, "a") as f: + f.write(md_report) + print(f"Wrote GitHub Step Summary to {args.github_step_summary}") + + # ---- Step 6: Upload per-run snapshot and HTML to S3 ---- + if args.s3_summary_uri: + s3_upload(str(json_path), args.s3_summary_uri) + + if args.s3_html_uri: + s3_upload(str(html_path), args.s3_html_uri) + + # ---- Exit code ---- + genuine_failures = len(classified["failed"]) + len(classified["error"]) + if genuine_failures > 0: + print(f"\nFAILED: {genuine_failures} genuine test failure(s) detected.") + return 1 + if classified["flaky"]: + print(f"\nWARNING: All tests passed but {len(classified['flaky'])} flaky test(s) detected.") + else: + print("\nAll tests passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh new file mode 100755 index 0000000000..809b918df8 --- /dev/null +++ b/ci/utils/nightly_report_helper.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Shared helper for generating nightly test reports with matrix-aware S3 paths. 
+# +# Usage (source from any test script): +# +# # For C++ tests (no Python version in matrix label): +# generate_nightly_report "cpp" +# +# # For Python tests (includes Python version in matrix label): +# generate_nightly_report "python" --with-python-version +# +# # For wheel tests: +# generate_nightly_report "wheel-python" --with-python-version +# +# Prerequisites (set before calling): +# RAPIDS_TESTS_DIR - directory containing JUnit XML test results +# +# Optional environment variables (auto-detected if not set): +# RAPIDS_CUDA_VERSION - CUDA version (e.g., "12.9") +# RAPIDS_PY_VERSION - Python version (e.g., "3.12"), used with --with-python-version +# RAPIDS_BRANCH - branch name (e.g., "main") +# CUOPT_DATASET_S3_URI - S3 base URI for reports +# GITHUB_SHA - commit SHA +# GITHUB_STEP_SUMMARY - path for GitHub Actions step summary + +# Resolve the directory where THIS helper lives (ci/utils/) +_HELPER_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + +generate_nightly_report() { + local test_type="${1:?Usage: generate_nightly_report [--with-python-version]}" + local include_py_version=false + + shift + while [ $# -gt 0 ]; do + case "$1" in + --with-python-version) include_py_version=true ;; + *) echo "WARNING: Unknown option: $1" >&2 ;; + esac + shift + done + + # --- Build matrix label --- + local cuda_tag="cuda${RAPIDS_CUDA_VERSION:-unknown}" + local arch_tag + arch_tag="$(arch)" + local matrix_label="${cuda_tag}-${arch_tag}" + + if [ "${include_py_version}" = true ]; then + local py_tag="py${RAPIDS_PY_VERSION:-unknown}" + matrix_label="${cuda_tag}-${py_tag}-${arch_tag}" + fi + + local branch_slug + branch_slug=$(echo "${RAPIDS_BRANCH:-main}" | tr '/' '-') + local run_date + run_date="$(date +%F)" + + # --- Ensure results dir exists --- + RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" + mkdir -p "${RAPIDS_TESTS_DIR}" + + local report_output_dir="${RAPIDS_TESTS_DIR}/report" + mkdir -p "${report_output_dir}" + + # --- Build S3 URIs --- + local 
s3_history_uri="" + local s3_summary_uri="" + local s3_html_uri="" + + if [ -n "${CUOPT_DATASET_S3_URI:-}" ]; then + local s3_base="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" + s3_history_uri="${s3_base}/history/${test_type}-${branch_slug}-${matrix_label}.json" + s3_summary_uri="${s3_base}/summaries/${run_date}/${test_type}-${matrix_label}.json" + s3_html_uri="${s3_base}/reports/${run_date}/${test_type}-${matrix_label}.html" + fi + + # --- Run nightly report --- + python3 "${_HELPER_DIR}/nightly_report.py" \ + --results-dir "${RAPIDS_TESTS_DIR}" \ + --output-dir "${report_output_dir}" \ + --sha "${GITHUB_SHA:-unknown}" \ + --date "${run_date}" \ + --test-type "${test_type}" \ + --matrix-label "${matrix_label}" \ + --s3-history-uri "${s3_history_uri}" \ + --s3-summary-uri "${s3_summary_uri}" \ + --s3-html-uri "${s3_html_uri}" \ + --github-step-summary "${GITHUB_STEP_SUMMARY:-}" \ + || true +} diff --git a/ci/utils/s3_helpers.py b/ci/utils/s3_helpers.py new file mode 100644 index 0000000000..f1f5795661 --- /dev/null +++ b/ci/utils/s3_helpers.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Shared S3 helper functions for cuOpt CI scripts. + +Maps CUOPT_AWS_* credentials to standard AWS env vars and provides +download / upload / list wrappers around the aws CLI. 
"""
Shared S3 helper functions for cuOpt CI scripts.

Maps CUOPT_AWS_* credentials to standard AWS env vars and provides
download / upload / list wrappers around the aws CLI.
"""

import os
import subprocess
import sys


def s3_env():
    """Return a copy of os.environ with CUOPT AWS credentials mapped to the
    standard AWS variables the aws CLI reads.

    Falls back to ``us-east-1`` when no region is configured anywhere.
    """
    env = os.environ.copy()
    if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"):
        env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"]
    if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"):
        env["AWS_SECRET_ACCESS_KEY"] = os.environ["CUOPT_AWS_SECRET_ACCESS_KEY"]
    if os.environ.get("CUOPT_AWS_REGION"):
        env["AWS_DEFAULT_REGION"] = os.environ["CUOPT_AWS_REGION"]
    elif "AWS_DEFAULT_REGION" not in env:
        env["AWS_DEFAULT_REGION"] = "us-east-1"
    return env


def s3_download(s3_uri, local_path):
    """Download a file from S3. Returns True on success, False on any error.

    A missing aws CLI or a failed transfer is reported as a warning, never
    raised: CI reporting is best-effort.
    """
    env = s3_env()
    try:
        subprocess.run(
            ["aws", "s3", "cp", s3_uri, local_path],
            env=env, check=True, capture_output=True, text=True,
        )
        print(f"Downloaded {s3_uri}")
        return True
    except FileNotFoundError:
        print("WARNING: aws CLI not found, skipping S3 download", file=sys.stderr)
        return False
    except subprocess.CalledProcessError as exc:
        # A download miss is expected on the very first run (no history yet).
        print(
            f"WARNING: S3 download failed (first run?): {exc.stderr.strip()}",
            file=sys.stderr,
        )
        return False


def s3_upload(local_path, s3_uri):
    """Upload a file to S3. Returns True on success, False on any error."""
    env = s3_env()
    try:
        subprocess.run(
            ["aws", "s3", "cp", local_path, s3_uri],
            env=env, check=True, capture_output=True, text=True,
        )
        print(f"Uploaded {local_path} to {s3_uri}")
        return True
    except FileNotFoundError:
        print("WARNING: aws CLI not found, skipping S3 upload", file=sys.stderr)
        return False
    except subprocess.CalledProcessError as exc:
        print(f"WARNING: S3 upload failed: {exc.stderr.strip()}", file=sys.stderr)
        return False


def s3_list(s3_prefix):
    """List object URIs directly under an S3 prefix.

    Returns a list of fully-qualified S3 URIs (``s3_prefix`` is expected to
    end with "/"). ``aws s3 ls`` reports sub-prefixes as "PRE <name>/" rows;
    those are skipped because callers expect downloadable objects, not
    directories. Returns [] when the aws CLI is missing or the listing fails.
    """
    env = s3_env()
    try:
        result = subprocess.run(
            ["aws", "s3", "ls", s3_prefix],
            env=env, check=True, capture_output=True, text=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError) as exc:
        print(f"WARNING: S3 ls failed: {exc}", file=sys.stderr)
        return []

    uris = []
    for line in result.stdout.strip().splitlines():
        parts = line.split()
        if not parts:
            continue
        # "PRE sub/" rows denote sub-prefixes, not objects — do not turn
        # them into fake object URIs.
        if parts[0] == "PRE":
            continue
        uris.append(f"{s3_prefix}{parts[-1]}")
    return uris
"notebooks" --with-python-version + rapids-logger "Notebook test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_python.sh b/ci/test_python.sh index 4f91c83334..9af612ad76 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -77,5 +77,9 @@ timeout 20m ./ci/run_cuopt_server_pytests.sh \ rapids-logger "Test skills/ assets (Python, C, CLI)" timeout 10m ./ci/test_skills_assets.sh +rapids-logger "Generate nightly test report" +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" +generate_nightly_report "python" --with-python-version + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh index a327082e83..5d002731b0 100755 --- a/ci/test_wheel_cuopt.sh +++ b/ci/test_wheel_cuopt.sh @@ -63,6 +63,13 @@ cd - RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" export RAPIDS_DATASET_ROOT_DIR +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + # Run CLI tests timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh @@ -71,7 +78,9 @@ timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh # Due to race condition in certain cases UCX might not be able to cleanup properly, so we set the number of threads to 1 export OMP_NUM_THREADS=1 -timeout 30m ./ci/run_cuopt_pytests.sh --verbose --capture=no +timeout 30m ./ci/run_cuopt_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-wheel-cuopt.xml" \ + --verbose --capture=no # run thirdparty integration tests for only nightly builds if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then @@ -80,3 +89,9 @@ if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then ./ci/thirdparty-testing/run_pulp_tests.sh ./ci/thirdparty-testing/run_pyomo_tests.sh fi + +# Generate nightly test report +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" +generate_nightly_report "wheel-python" 
--with-python-version + +exit ${EXITCODE} diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh index a76969b965..55852a913c 100755 --- a/ci/test_wheel_cuopt_server.sh +++ b/ci/test_wheel_cuopt_server.sh @@ -39,7 +39,22 @@ rapids-pip-retry install \ RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" export RAPIDS_DATASET_ROOT_DIR -timeout 30m ./ci/run_cuopt_server_pytests.sh --verbose --capture=no +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" + +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + +timeout 30m ./ci/run_cuopt_server_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-wheel-cuopt-server.xml" \ + --verbose --capture=no # Run documentation tests ./ci/test_doc_examples.sh + +# Generate nightly test report +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" +generate_nightly_report "wheel-server" --with-python-version + +exit ${EXITCODE} From f6b0f75555c5e149ddb1f3edc84a3a37612da1e9 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 11:46:23 -0500 Subject: [PATCH 04/60] Add aggregation, Slack notifications, and test dashboard Add aggregate_nightly.py to merge per-matrix JSON summaries into a consolidated report with matrix grid. Add Slack notifiers for both per-job and consolidated messages with HTML file upload support. Add nightly_summary.sh wrapper for the post-test aggregation job. Add static HTML dashboard with matrix overview, failure drill-down, and trend charts reading from S3 index.json. 
--- ci/dashboard/index.html | 652 ++++++++++++++++++++++++++ ci/nightly_summary.sh | 71 +++ ci/utils/aggregate_nightly.py | 605 ++++++++++++++++++++++++ ci/utils/send_consolidated_summary.sh | 287 ++++++++++++ ci/utils/send_nightly_summary.sh | 172 +++++++ 5 files changed, 1787 insertions(+) create mode 100644 ci/dashboard/index.html create mode 100755 ci/nightly_summary.sh create mode 100644 ci/utils/aggregate_nightly.py create mode 100755 ci/utils/send_consolidated_summary.sh create mode 100755 ci/utils/send_nightly_summary.sh diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html new file mode 100644 index 0000000000..9b56a7c915 --- /dev/null +++ b/ci/dashboard/index.html @@ -0,0 +1,652 @@ + + + + + +cuOpt Nightly Test Dashboard + + + + + + + + + + + + +
+
Loading dashboard data...
+
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Aggregate all per-matrix nightly test summaries and send a single
# consolidated Slack notification. Runs as a post-test job after all
# matrix CI jobs finish.
#
# Required environment variables:
#   CUOPT_DATASET_S3_URI        - S3 base URI
#   CUOPT_AWS_ACCESS_KEY_ID     - AWS credentials
#   CUOPT_AWS_SECRET_ACCESS_KEY
#
# Optional:
#   CUOPT_SLACK_WEBHOOK_URL - sends Slack if set
#   RAPIDS_BRANCH           - branch name (default: main)
#   RAPIDS_BUILD_TYPE       - build type (nightly, pull-request, etc.)

set -euo pipefail

# Directory of this script; utils/ and dashboard/ are resolved relative to it.
SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
OUTPUT_DIR="${PWD}/aggregate-output"
mkdir -p "${OUTPUT_DIR}"

RUN_DATE="$(date +%F)"
BRANCH="${RAPIDS_BRANCH:-main}"

# Link back to the CI run that produced this aggregation.
GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}"

# Without a bucket there is nothing to aggregate — fail loudly.
if [ -z "${CUOPT_DATASET_S3_URI:-}" ]; then
  echo "ERROR: CUOPT_DATASET_S3_URI is not set. Cannot aggregate." >&2
  exit 1
fi

# S3 layout: history/, summaries/<date>/, reports/<date>/, index.json, dashboard/
S3_BASE="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly"
S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/"
S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/"
S3_CONSOLIDATED_JSON="${S3_BASE}/summaries/${RUN_DATE}/consolidated.json"
S3_CONSOLIDATED_HTML="${S3_BASE}/reports/${RUN_DATE}/consolidated.html"
S3_INDEX_URI="${S3_BASE}/index.json"
S3_DASHBOARD_URI="${S3_BASE}/dashboard/index.html"
DASHBOARD_DIR="${SCRIPT_DIR}/dashboard"

echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}"

# Merge today's per-matrix JSON summaries, publish consolidated JSON/HTML,
# refresh index.json, and upload the static dashboard.
python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \
  --s3-summaries-prefix "${S3_SUMMARIES_PREFIX}" \
  --s3-reports-prefix "${S3_REPORTS_PREFIX}" \
  --s3-output-uri "${S3_CONSOLIDATED_JSON}" \
  --s3-html-output-uri "${S3_CONSOLIDATED_HTML}" \
  --s3-index-uri "${S3_INDEX_URI}" \
  --s3-dashboard-uri "${S3_DASHBOARD_URI}" \
  --dashboard-dir "${DASHBOARD_DIR}" \
  --output-dir "${OUTPUT_DIR}" \
  --date "${RUN_DATE}" \
  --branch "${BRANCH}" \
  --github-run-url "${GITHUB_RUN_URL}"

# Send consolidated Slack notification if webhook is available and this is a nightly build
if [ -n "${CUOPT_SLACK_WEBHOOK_URL:-}" ] && [ "${RAPIDS_BUILD_TYPE:-}" = "nightly" ]; then
  echo "Sending consolidated Slack notification"
  CONSOLIDATED_SUMMARY="${OUTPUT_DIR}/consolidated_summary.json" \
  CONSOLIDATED_HTML="${OUTPUT_DIR}/consolidated_report.html" \
  SLACK_WEBHOOK_URL="${CUOPT_SLACK_WEBHOOK_URL}" \
  SLACK_BOT_TOKEN="${CUOPT_SLACK_BOT_TOKEN:-}" \
  SLACK_CHANNEL_ID="${CUOPT_SLACK_CHANNEL_ID:-}" \
  REPORT_URL="${S3_CONSOLIDATED_HTML}" \
  bash "${SCRIPT_DIR}/utils/send_consolidated_summary.sh"
fi

echo "Nightly summary complete."
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Aggregate per-matrix nightly test summaries into a single consolidated report.

Runs as a post-test job after all matrix CI jobs finish. It:
  1. Lists all JSON summaries uploaded to S3 for today's date
  2. Downloads and merges them
  3. Builds a matrix grid (test_type x matrix_label -> status)
  4. Generates a consolidated JSON, HTML report, and Slack payload
  5. Uploads the consolidated report to S3

Usage:
    python ci/utils/aggregate_nightly.py \\
        --s3-summaries-prefix s3://bucket/ci_test_reports/nightly/summaries/2026-04-13/ \\
        --s3-reports-prefix s3://bucket/ci_test_reports/nightly/reports/2026-04-13/ \\
        --output-dir /tmp/aggregate-output \\
        --date 2026-04-13 \\
        --branch main
"""

import argparse
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

# Ensure ci/utils is importable when invoked as a script
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from s3_helpers import s3_download, s3_upload, s3_list  # noqa: E402


# ---------------------------------------------------------------------------
# Download and merge summaries
# ---------------------------------------------------------------------------

def download_summaries(s3_prefix, local_dir):
    """Fetch every ``*.json`` summary under ``s3_prefix`` into ``local_dir``.

    Returns the list of successfully downloaded-and-parsed summary dicts;
    download and parse problems are warnings, not errors.
    """
    target = Path(local_dir)
    target.mkdir(parents=True, exist_ok=True)

    json_uris = [uri for uri in s3_list(s3_prefix) if uri.endswith(".json")]
    print(f"Found {len(json_uris)} summary file(s) at {s3_prefix}")

    loaded = []
    for uri in json_uris:
        dest = str(target / uri.rsplit("/", 1)[-1])
        if not s3_download(uri, dest):
            continue
        try:
            with open(dest) as fh:
                loaded.append(json.load(fh))
        except (json.JSONDecodeError, OSError) as exc:
            print(f"WARNING: Failed to parse {dest}: {exc}",
                  file=sys.stderr)
    return loaded


def load_local_summaries(local_dir):
    """Load summaries from a local directory (for testing without S3)."""
    loaded = []
    for json_file in sorted(Path(local_dir).glob("*.json")):
        try:
            with open(json_file) as fh:
                loaded.append(json.load(fh))
        except (json.JSONDecodeError, OSError) as exc:
            print(f"WARNING: Failed to parse {json_file}: {exc}",
                  file=sys.stderr)
    return loaded


# ---------------------------------------------------------------------------
# Aggregation
# ---------------------------------------------------------------------------

def aggregate_summaries(summaries):
    """Merge per-matrix summaries into a consolidated view.

    Returns a dict with:
      - matrix_grid: list of {test_type, matrix_label, status, counts, sha}
      - totals: aggregate counts across all matrices
      - all_new_failures, all_recurring_failures, all_flaky_tests,
        all_resolved_tests: merged lists with matrix context added
    """
    grid = []
    totals = {
        "total": 0, "passed": 0, "failed": 0,
        "flaky": 0, "skipped": 0, "resolved": 0,
    }
    merged = {
        "new_failures": [],
        "recurring_failures": [],
        "flaky_tests": [],
        "resolved_tests": [],
    }

    for summary in summaries:
        test_type = summary.get("test_type", "unknown")
        matrix_label = summary.get("matrix_label", "unknown")
        counts = summary.get("counts", {})

        # Job status priority: failures dominate, then flakiness, then
        # "no results at all", otherwise the job passed cleanly.
        if counts.get("failed", 0) > 0:
            status = (
                "failed-new"
                if summary.get("has_new_failures", False)
                else "failed-recurring"
            )
        elif counts.get("flaky", 0) > 0:
            status = "flaky"
        elif counts.get("total", 0) == 0:
            status = "no-results"
        else:
            status = "passed"

        grid.append({
            "test_type": test_type,
            "matrix_label": matrix_label,
            "status": status,
            "counts": counts,
            "sha": summary.get("sha", ""),
        })

        for key in totals:
            totals[key] += counts.get(key, 0)

        # Tag every failure entry with the matrix it came from so the merged
        # lists remain attributable after flattening.
        context = {"test_type": test_type, "matrix_label": matrix_label}
        for key, bucket in merged.items():
            bucket.extend(
                {**entry, **context} for entry in summary.get(key, [])
            )

    # Deterministic display order across runs.
    grid.sort(key=lambda cell: (cell["test_type"], cell["matrix_label"]))

    return {
        "matrix_grid": grid,
        "totals": totals,
        "all_new_failures": merged["new_failures"],
        "all_recurring_failures": merged["recurring_failures"],
        "all_flaky_tests": merged["flaky_tests"],
        "all_resolved_tests": merged["resolved_tests"],
    }
+# --------------------------------------------------------------------------- +# Consolidated JSON +# --------------------------------------------------------------------------- + +def generate_consolidated_json(agg, date_str, branch, github_run_url=""): + """Generate the consolidated JSON for Slack and dashboard.""" + total_jobs = len(agg["matrix_grid"]) + failed_jobs = sum( + 1 for g in agg["matrix_grid"] if g["status"].startswith("failed") + ) + flaky_jobs = sum( + 1 for g in agg["matrix_grid"] if g["status"] == "flaky" + ) + passed_jobs = sum( + 1 for g in agg["matrix_grid"] if g["status"] == "passed" + ) + + return { + "timestamp": datetime.now(timezone.utc).isoformat(), + "date": date_str, + "branch": branch, + "github_run_url": github_run_url, + "job_summary": { + "total": total_jobs, + "passed": passed_jobs, + "failed": failed_jobs, + "flaky": flaky_jobs, + }, + "test_totals": agg["totals"], + "has_new_failures": len(agg["all_new_failures"]) > 0, + "matrix_grid": agg["matrix_grid"], + "new_failures": agg["all_new_failures"], + "recurring_failures": agg["all_recurring_failures"], + "flaky_tests": agg["all_flaky_tests"], + "resolved_tests": agg["all_resolved_tests"], + } + + +# --------------------------------------------------------------------------- +# Consolidated HTML +# --------------------------------------------------------------------------- + +def _html_escape(text): + return ( + str(text).replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) + + +def _status_badge(status): + """Return an HTML badge for a matrix cell status.""" + colors = { + "passed": ("#388e3c", "PASS"), + "failed-new": ("#d32f2f", "NEW FAIL"), + "failed-recurring": ("#e65100", "RECURRING"), + "flaky": ("#f9a825", "FLAKY"), + "no-results": ("#757575", "NO DATA"), + } + bg, label = colors.get(status, ("#757575", status.upper())) + text_color = "#212121" if status == "flaky" else "#fff" + return ( + f'' + f'{label}' + ) + + +def 
def generate_consolidated_html(
    agg, date_str, branch, github_run_url="", s3_reports_prefix="",
):
    """Generate a consolidated HTML dashboard for all matrix combos.

    NOTE(review): the original markup was corrupted during extraction (tags
    stripped); this reconstruction preserves the reported data and section
    structure — confirm visual details against the intended design.

    Parameters
    ----------
    agg : dict
        Output of ``aggregate_summaries``.
    date_str, branch : str
        Run date and branch shown in the header.
    github_run_url : str
        Optional link back to the GitHub Actions run.
    s3_reports_prefix : str
        Optional S3 prefix used to link each grid row to its per-matrix
        HTML report.

    Returns the HTML document as a single string.
    """
    grid = agg["matrix_grid"]
    totals = agg["totals"]
    total_jobs = len(grid)
    failed_jobs = sum(1 for g in grid if g["status"].startswith("failed"))

    # Top status bar: red if any job failed, yellow if only flaky, else green.
    if failed_jobs > 0:
        bar_color = "#d32f2f"
        bar_text = f"{failed_jobs} of {total_jobs} matrix jobs have failures"
    elif any(g["status"] == "flaky" for g in grid):
        bar_color = "#f9a825"
        bar_text = "All jobs passed (flaky tests detected)"
    else:
        bar_color = "#388e3c"
        bar_text = f"All {total_jobs} matrix jobs passed"

    parts = [f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>cuOpt Nightly &mdash; {_html_escape(branch)} &mdash; {_html_escape(date_str)}</title>
<style>
body {{ font-family: -apple-system, "Segoe UI", Roboto, sans-serif; margin: 24px; color: #212121; }}
h1 {{ font-size: 22px; }}
h2 {{ font-size: 17px; margin-top: 28px; }}
table {{ border-collapse: collapse; margin: 10px 0; }}
th, td {{ border: 1px solid #ddd; padding: 6px 10px; font-size: 14px; text-align: left; }}
th {{ background: #f5f5f5; }}
pre {{ white-space: pre-wrap; margin: 4px 0; }}
.bar {{ padding: 10px 16px; border-radius: 4px; color: #fff; font-weight: 600; background: {bar_color}; }}
.stat {{ display: inline-block; margin-right: 22px; text-align: center; }}
.stat b {{ display: block; font-size: 20px; }}
</style>
</head>
<body>
<h1>cuOpt Nightly Tests &mdash; {_html_escape(branch)}</h1>
<p>Date: {_html_escape(date_str)}"""]

    if github_run_url:
        parts.append(
            f' &nbsp;|&nbsp; <a href="{_html_escape(github_run_url)}">'
            f"GitHub Actions Run</a>"
        )

    # Status bar plus the six headline counters.
    parts.append(f"""</p>
<div class="bar">{bar_text}</div>
<p>
<span class="stat"><b>{totals['total']}</b>Total Tests</span>
<span class="stat"><b>{totals['passed']}</b>Passed</span>
<span class="stat"><b>{totals['failed']}</b>Failed</span>
<span class="stat"><b>{totals['flaky']}</b>Flaky</span>
<span class="stat"><b>{totals['skipped']}</b>Skipped</span>
<span class="stat"><b>{totals['resolved']}</b>Stabilized</span>
</p>""")

    # --- Matrix grid ---
    parts.append("<h2>Matrix Overview</h2>")
    parts.append(
        "<table><tr><th>Test Type</th><th>Matrix</th><th>Status</th>"
        "<th>Passed</th><th>Failed</th><th>Flaky</th><th>Total</th>"
        "<th>Report</th></tr>"
    )
    for g in grid:
        counts = g["counts"]
        # Link to the per-matrix HTML report on S3 when a prefix is known.
        report_link = ""
        if s3_reports_prefix:
            report_filename = f'{g["test_type"]}-{g["matrix_label"]}.html'
            report_link = (
                f'<a href="{_html_escape(s3_reports_prefix + report_filename)}">'
                f"View</a>"
            )
        parts.append(
            f'<tr><td>{_html_escape(g["test_type"])}</td>'
            f'<td>{_html_escape(g["matrix_label"])}</td>'
            f'<td>{_status_badge(g["status"])}</td>'
            f'<td>{counts.get("passed", 0)}</td>'
            f'<td>{counts.get("failed", 0)}</td>'
            f'<td>{counts.get("flaky", 0)}</td>'
            f'<td>{counts.get("total", 0)}</td>'
            f"<td>{report_link}</td></tr>"
        )
    parts.append("</table>")

    # --- New failures ---
    if agg["all_new_failures"]:
        parts.append("<h2>New Failures</h2>")
        parts.append(
            "<table><tr><th>Test Type</th><th>Matrix</th><th>Suite</th>"
            "<th>Test</th><th>Error</th></tr>"
        )
        for e in agg["all_new_failures"]:
            # Collapsible error: 100-char summary, full message on expand.
            msg = _html_escape(e.get("message", ""))
            short = _html_escape(e.get("message", "")[:100])
            parts.append(
                f'<tr><td>{_html_escape(e["test_type"])}</td>'
                f'<td>{_html_escape(e["matrix_label"])}</td>'
                f'<td>{_html_escape(e["suite"])}</td>'
                f'<td>{_html_escape(e["name"])}</td>'
                f"<td><details><summary>{short}</summary>"
                f"<pre>{msg}</pre></details></td></tr>"
            )
        parts.append("</table>")

    # --- Recurring failures ---
    if agg["all_recurring_failures"]:
        parts.append("<h2>Recurring Failures</h2>")
        parts.append(
            "<table><tr><th>Test Type</th><th>Matrix</th><th>Suite</th>"
            "<th>Test</th><th>Since</th><th>Error</th></tr>"
        )
        for e in agg["all_recurring_failures"]:
            msg = _html_escape(e.get("message", ""))
            short = _html_escape(e.get("message", "")[:100])
            parts.append(
                f'<tr><td>{_html_escape(e["test_type"])}</td>'
                f'<td>{_html_escape(e["matrix_label"])}</td>'
                f'<td>{_html_escape(e["suite"])}</td>'
                f'<td>{_html_escape(e["name"])}</td>'
                f'<td>{_html_escape(e.get("first_seen", "?"))}</td>'
                f"<td><details><summary>{short}</summary>"
                f"<pre>{msg}</pre></details></td></tr>"
            )
        parts.append("</table>")

    # --- Stabilized (previously failing, now passing) ---
    if agg["all_resolved_tests"]:
        parts.append("<h2>Stabilized Tests</h2>")
        parts.append(
            "<table><tr><th>Test Type</th><th>Matrix</th><th>Suite</th>"
            "<th>Test</th><th>Failing Since</th><th>Count</th></tr>"
        )
        for e in agg["all_resolved_tests"]:
            parts.append(
                f'<tr><td>{_html_escape(e["test_type"])}</td>'
                f'<td>{_html_escape(e["matrix_label"])}</td>'
                f'<td>{_html_escape(e["suite"])}</td>'
                f'<td>{_html_escape(e["name"])}</td>'
                f'<td>{_html_escape(e.get("first_seen", "?"))}</td>'
                f'<td>{e.get("failure_count", "?")}</td></tr>'
            )
        parts.append("</table>")

    # --- Flaky (passed only after retries) ---
    if agg["all_flaky_tests"]:
        parts.append("<h2>Flaky Tests</h2>")
        parts.append(
            "<table><tr><th>Test Type</th><th>Matrix</th><th>Suite</th>"
            "<th>Test</th><th>Retries</th></tr>"
        )
        for e in agg["all_flaky_tests"]:
            parts.append(
                f'<tr><td>{_html_escape(e["test_type"])}</td>'
                f'<td>{_html_escape(e["matrix_label"])}</td>'
                f'<td>{_html_escape(e["suite"])}</td>'
                f'<td>{_html_escape(e["name"])}</td>'
                f'<td>{e.get("retry_count", "?")}</td></tr>'
            )
        parts.append("</table>")

    # Celebrate a fully clean run.
    if (
        not agg["all_new_failures"]
        and not agg["all_recurring_failures"]
        and not agg["all_flaky_tests"]
        and not agg["all_resolved_tests"]
    ):
        parts.append(
            "<p><b>All tests passed across all matrices!</b></p>"
        )

    parts.append("</body>\n</html>")
    return "\n".join(parts)


# ---------------------------------------------------------------------------
# Index management
# ---------------------------------------------------------------------------

MAX_INDEX_DAYS = 90  # Keep at most 90 days in the index


def update_index(s3_index_uri, date_str, consolidated, output_dir):
    """Download index.json, add today's entry, prune old entries, re-upload.

    The index is the dashboard's table of contents: one compact entry per
    date with just enough data for trend charts. Corrupt or missing remote
    indexes are silently replaced with a fresh one.
    """
    local_index = str(output_dir / "index.json")

    # Download existing index (or start fresh)
    index = {"_schema_version": 1, "dates": {}}
    if s3_download(s3_index_uri, local_index):
        try:
            with open(local_index) as f:
                loaded = json.load(f)
            if "dates" in loaded:
                index = loaded
        except (json.JSONDecodeError, OSError):
            # Unreadable remote index: fall through and rebuild from scratch.
            pass

    # Add today's entry (compact — just enough for the dashboard trends)
    index["dates"][date_str] = {
        "job_summary": consolidated.get("job_summary", {}),
        "test_totals": consolidated.get("test_totals", {}),
        "has_new_failures": consolidated.get("has_new_failures", False),
        "branch": consolidated.get("branch", ""),
        "github_run_url": consolidated.get("github_run_url", ""),
    }

    # Prune to the newest MAX_INDEX_DAYS entries (ISO dates sort naturally).
    dates_sorted = sorted(index["dates"].keys(), reverse=True)
    if len(dates_sorted) > MAX_INDEX_DAYS:
        for old_date in dates_sorted[MAX_INDEX_DAYS:]:
            del index["dates"][old_date]

    # Write and upload
    with open(local_index, "w") as f:
        json.dump(index, f, indent=2, sort_keys=True)
        f.write("\n")
    print(f"Updated index.json with {len(index['dates'])} date(s)")

    s3_upload(local_index, s3_index_uri)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """CLI entry point: collect, aggregate, render, and publish summaries.

    Returns a process exit code (0 on success, 1 on usage error).
    """
    parser = argparse.ArgumentParser(
        description="Aggregate per-matrix nightly test summaries"
    )
    parser.add_argument(
        "--s3-summaries-prefix", default="",
        help="S3 prefix for per-matrix JSON summaries "
             "(e.g., s3://bucket/.../summaries/2026-04-13/)",
    )
    parser.add_argument(
        "--s3-reports-prefix", default="",
        help="S3 prefix where per-matrix HTML reports live (for linking)",
    )
    parser.add_argument(
        "--s3-output-uri", default="",
        help="S3 URI to upload the consolidated JSON",
    )
    parser.add_argument(
        "--s3-html-output-uri", default="",
        help="S3 URI to upload the consolidated HTML report",
    )
    parser.add_argument(
        "--s3-index-uri", default="",
        help="S3 URI for the index.json that tracks all available dates "
             "(read + write)",
    )
    parser.add_argument(
        "--s3-dashboard-uri", default="",
        help="S3 URI to upload the dashboard HTML "
             "(e.g., s3://bucket/.../dashboard/index.html)",
    )
    parser.add_argument(
        "--dashboard-dir", default="",
        help="Local directory containing dashboard files to upload",
    )
    parser.add_argument(
        "--local-summaries-dir", default="",
        help="Local directory with JSON summaries "
             "(alternative to S3, for testing)",
    )
    parser.add_argument(
        "--output-dir", default="aggregate-output",
        help="Local directory to write output files",
    )
    parser.add_argument(
        "--date",
        default=datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        help="Date for this run (YYYY-MM-DD)",
    )
    parser.add_argument("--branch", default="main", help="Branch name")
    parser.add_argument(
        "--github-run-url", default="", help="URL to the GitHub Actions run",
    )

    args = parser.parse_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # ---- Step 1: Collect summaries (local dir takes precedence for tests) ----
    if args.local_summaries_dir:
        summaries = load_local_summaries(args.local_summaries_dir)
    elif args.s3_summaries_prefix:
        download_dir = output_dir / "downloaded_summaries"
        summaries = download_summaries(args.s3_summaries_prefix, download_dir)
    else:
        print("ERROR: Provide --s3-summaries-prefix or --local-summaries-dir",
              file=sys.stderr)
        return 1

    if not summaries:
        # An empty report is still published so the dashboard shows the gap.
        print("WARNING: No summaries found. Generating empty report.",
              file=sys.stderr)

    print(f"Loaded {len(summaries)} matrix summary file(s)")

    # ---- Step 2: Aggregate ----
    agg = aggregate_summaries(summaries)
    print(
        f"Matrix grid: {len(agg['matrix_grid'])} jobs — "
        f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'passed')} passed, "
        f"{sum(1 for g in agg['matrix_grid'] if g['status'].startswith('failed'))} failed, "
        f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'flaky')} flaky"
    )

    # ---- Step 3: Generate outputs ----
    consolidated = generate_consolidated_json(
        agg, args.date, args.branch, args.github_run_url,
    )

    json_path = output_dir / "consolidated_summary.json"
    json_path.write_text(json.dumps(consolidated, indent=2) + "\n")
    print(f"Consolidated JSON written to {json_path}")

    html_report = generate_consolidated_html(
        agg, args.date, args.branch, args.github_run_url,
        args.s3_reports_prefix,
    )
    html_path = output_dir / "consolidated_report.html"
    html_path.write_text(html_report)
    print(f"Consolidated HTML written to {html_path}")

    # ---- Step 4: Upload to S3 (each target is optional) ----
    if args.s3_output_uri:
        s3_upload(str(json_path), args.s3_output_uri)
    if args.s3_html_output_uri:
        s3_upload(str(html_path), args.s3_html_output_uri)

    # ---- Step 5: Update index.json ----
    if args.s3_index_uri:
        update_index(
            args.s3_index_uri, args.date, consolidated, output_dir,
        )

    # ---- Step 6: Upload dashboard ----
    if args.s3_dashboard_uri and args.dashboard_dir:
        dashboard_file = Path(args.dashboard_dir) / "index.html"
        if dashboard_file.exists():
            s3_upload(str(dashboard_file), args.s3_dashboard_uri)
        else:
            print(f"WARNING: Dashboard not found at {dashboard_file}",
                  file=sys.stderr)

    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,287 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Send a single consolidated Slack notification for the entire nightly run. +# Reads the aggregated JSON produced by aggregate_nightly.py and sends a rich +# Slack message with: +# - Matrix grid overview (test_type x matrix → status) +# - Failure tables with :new: / :repeat: badges and matrix context +# - @channel on new genuine failures +# - Stabilized and flaky test summaries +# - Link to GitHub Actions run and consolidated HTML report +# +# Required environment variables: +# SLACK_WEBHOOK_URL - Slack incoming webhook URL +# CONSOLIDATED_SUMMARY - Path to consolidated_summary.json +# +# Optional environment variables: +# REPORT_URL - Link to the consolidated HTML report on S3 +# CONSOLIDATED_HTML - Path to consolidated HTML file to upload to Slack +# SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) for file uploads +# SLACK_CHANNEL_ID - Slack channel ID for file uploads (required with bot token) + +set -euo pipefail + +CONSOLIDATED_SUMMARY="${CONSOLIDATED_SUMMARY:?CONSOLIDATED_SUMMARY must point to consolidated_summary.json}" +SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:?SLACK_WEBHOOK_URL is required}" +REPORT_URL="${REPORT_URL:-}" +CONSOLIDATED_HTML="${CONSOLIDATED_HTML:-}" +SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}" +SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}" + +if [ ! 
-f "${CONSOLIDATED_SUMMARY}" ]; then + echo "ERROR: Summary file not found: ${CONSOLIDATED_SUMMARY}" >&2 + exit 1 +fi + +PAYLOAD=$(python3 - "${CONSOLIDATED_SUMMARY}" "${REPORT_URL}" <<'PYEOF' +import json, sys + +summary_path, report_url = sys.argv[1:3] + +with open(summary_path) as f: + d = json.load(f) + +branch = d.get("branch", "main") +date = d.get("date", "unknown") +github_run_url = d.get("github_run_url", "") +jobs = d.get("job_summary", {}) +totals = d.get("test_totals", {}) +grid = d.get("matrix_grid", []) +has_new = d.get("has_new_failures", False) + +total_jobs = jobs.get("total", 0) +failed_jobs = jobs.get("failed", 0) +flaky_jobs = jobs.get("flaky", 0) +passed_jobs = jobs.get("passed", 0) + +# --- Status line --- +if failed_jobs > 0 and has_new: + emoji = ":rotating_light:" + text = f"NEW test failures in {failed_jobs} matrix job(s)" + mention = " " +elif failed_jobs > 0: + emoji = ":x:" + text = f"Recurring failures in {failed_jobs} matrix job(s)" + mention = "" +elif flaky_jobs > 0: + emoji = ":large_yellow_circle:" + text = "All jobs passed but flaky tests detected" + mention = "" +else: + emoji = ":white_check_mark:" + text = f"All {total_jobs} matrix jobs passed" + mention = "" + +stats = ( + f":white_check_mark: {totals.get('passed', 0)} passed | " + f":x: {totals.get('failed', 0)} failed | " + f":warning: {totals.get('flaky', 0)} flaky | " + f":fast_forward: {totals.get('skipped', 0)} skipped | " + f"Total: {totals.get('total', 0)}" +) + +blocks = [] + +# Header +blocks.append({ + "type": "header", + "text": { + "type": "plain_text", + "text": f"cuOpt Nightly Tests \u2014 {branch} \u2014 {date}", + "emoji": True, + }, +}) + +# Status summary +blocks.append({ + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"{mention}{emoji} *{text}*\n\n{stats}", + }, +}) + +blocks.append({"type": "divider"}) + +# --- Matrix grid (compact) --- +# Group by test_type for readability +test_types = {} +for g in grid: + tt = g["test_type"] + 
test_types.setdefault(tt, []).append(g) + +status_icons = { + "passed": ":white_check_mark:", + "failed-new": ":rotating_light:", + "failed-recurring": ":x:", + "flaky": ":warning:", + "no-results": ":grey_question:", +} + +grid_lines = [] +for tt, entries in sorted(test_types.items()): + cells = [] + for g in entries: + icon = status_icons.get(g["status"], ":grey_question:") + label = g["matrix_label"] + failed_count = g["counts"].get("failed", 0) + if failed_count > 0: + cells.append(f"{icon} `{label}` ({failed_count} failures)") + else: + cells.append(f"{icon} `{label}`") + grid_lines.append(f"*{tt}*\n" + "\n".join(f" {c}" for c in cells)) + +# Slack blocks have a 3000 char limit per text field; truncate if needed +grid_text = "\n".join(grid_lines) +if len(grid_text) > 2900: + # Summarize instead of full grid + grid_text = ( + f"*Matrix Summary:* {passed_jobs} passed, {failed_jobs} failed, " + f"{flaky_jobs} flaky out of {total_jobs} jobs\n" + f"_(Full matrix in report link below)_" + ) + +blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": grid_text}, +}) + +# --- New failures (max 10 to avoid hitting Slack limits) --- +new_failures = d.get("new_failures", []) +if new_failures: + blocks.append({"type": "divider"}) + lines = [] + for f_entry in new_failures[:10]: + msg = f_entry.get("message", "")[:50].replace("\n", " ") + matrix = f_entry.get("matrix_label", "") + lines.append( + f" :new: `{f_entry['name']}` ({f_entry['test_type']} / {matrix}) \u2014 {msg}" + ) + if len(new_failures) > 10: + lines.append(f" _...and {len(new_failures) - 10} more_") + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": "*New Failures:*\n" + "\n".join(lines)}, + }) + +# --- Recurring failures (max 10) --- +recurring = d.get("recurring_failures", []) +if recurring: + blocks.append({"type": "divider"}) + lines = [] + for f_entry in recurring[:10]: + matrix = f_entry.get("matrix_label", "") + first = f_entry.get("first_seen", "?") + 
lines.append( + f" :repeat: `{f_entry['name']}` ({f_entry['test_type']} / {matrix}) \u2014 since {first}" + ) + if len(recurring) > 10: + lines.append(f" _...and {len(recurring) - 10} more_") + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": "*Recurring Failures:*\n" + "\n".join(lines)}, + }) + +# --- Stabilized --- +resolved = d.get("resolved_tests", []) +if resolved: + lines = [] + for r in resolved[:5]: + matrix = r.get("matrix_label", "") + count = r.get("failure_count", "?") + lines.append( + f" :white_check_mark: `{r['name']}` ({r['test_type']} / {matrix}) \u2014 failed {count}x" + ) + if len(resolved) > 5: + lines.append(f" _...and {len(resolved) - 5} more_") + blocks.append({ + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Stabilized (were failing, now pass):*\n" + "\n".join(lines), + }, + }) + +# --- Flaky summary (count only to save space) --- +flaky = d.get("flaky_tests", []) +if flaky: + # Group by test name to show unique flaky tests + unique_flaky = {} + for f_entry in flaky: + key = f_entry["name"] + unique_flaky.setdefault(key, []).append(f_entry.get("matrix_label", "")) + lines = [] + for name, matrices in sorted(unique_flaky.items())[:5]: + matrix_str = ", ".join(matrices[:3]) + if len(matrices) > 3: + matrix_str += f" +{len(matrices)-3} more" + lines.append(f" :warning: `{name}` ({matrix_str})") + if len(unique_flaky) > 5: + lines.append(f" _...and {len(unique_flaky) - 5} more unique flaky tests_") + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": "*Flaky Tests:*\n" + "\n".join(lines)}, + }) + +# --- Links --- +link_parts = [] +if github_run_url: + link_parts.append(f"<{github_run_url}|GitHub Actions>") +if report_url: + link_parts.append(f"<{report_url}|Full Report>") +if link_parts: + blocks.append({"type": "divider"}) + blocks.append({ + "type": "context", + "elements": [{"type": "mrkdwn", "text": " ".join(link_parts)}], + }) + +payload = { + "channel": 
"cuopt-regression-testing", + "username": "cuOpt Nightly Bot", + "icon_emoji": ":robot_face:", + "blocks": blocks, +} +print(json.dumps(payload)) +PYEOF +) + +echo "Sending consolidated Slack notification..." +curl -s -X POST \ + -H 'Content-type: application/json' \ + --data "${PAYLOAD}" \ + "${SLACK_WEBHOOK_URL}" + +echo "" +echo "Consolidated Slack notification sent." + +# Upload HTML report as a file to Slack (requires bot token) +if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ] && [ -n "${CONSOLIDATED_HTML}" ] && [ -f "${CONSOLIDATED_HTML}" ]; then + echo "Uploading HTML report to Slack..." + + # Read date and branch from the summary for the filename + REPORT_DATE=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('date','report'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "report") + REPORT_BRANCH=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('branch','main'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "main") + UPLOAD_FILENAME="cuopt-nightly-${REPORT_BRANCH}-${REPORT_DATE}.html" + + UPLOAD_RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -F "channels=${SLACK_CHANNEL_ID}" \ + -F "file=@${CONSOLIDATED_HTML}" \ + -F "filename=${UPLOAD_FILENAME}" \ + -F "title=cuOpt Nightly Report — ${REPORT_BRANCH} — ${REPORT_DATE}" \ + -F "initial_comment=Full nightly test report attached. Download and open in a browser for interactive details." \ + "https://slack.com/api/files.upload") + + if echo "${UPLOAD_RESPONSE}" | python3 -c "import json,sys; sys.exit(0 if json.load(sys.stdin).get('ok') else 1)" 2>/dev/null; then + echo "HTML report uploaded to Slack." + else + echo "WARNING: Slack file upload failed. Response: ${UPLOAD_RESPONSE}" >&2 + fi +else + if [ -n "${SLACK_BOT_TOKEN}" ] && [ -z "${SLACK_CHANNEL_ID}" ]; then + echo "WARNING: SLACK_BOT_TOKEN set but SLACK_CHANNEL_ID missing, skipping file upload." 
>&2 + fi +fi diff --git a/ci/utils/send_nightly_summary.sh b/ci/utils/send_nightly_summary.sh new file mode 100755 index 0000000000..7c2d16519c --- /dev/null +++ b/ci/utils/send_nightly_summary.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Enhanced Slack notification for nightly test results. +# Reads the JSON summary produced by nightly_report.py and sends a rich +# Slack message with: +# - Failure tables with :new: / :repeat: badges +# - @channel on new genuine failures +# - Stabilized tests (were failing, now passing) +# - Flaky test list +# +# Required environment variables: +# SLACK_WEBHOOK_URL - Slack incoming webhook URL (set from CUOPT_SLACK_WEBHOOK_URL in CI) +# NIGHTLY_SUMMARY - Path to nightly_summary.json from nightly_report.py +# +# Optional environment variables: +# GITHUB_RUN_URL - Link to the GitHub Actions run +# REPORT_URL - Link to the S3 HTML report +# CUOPT_BRANCH - Branch name (e.g. main, release/26.06) + +set -euo pipefail + +NIGHTLY_SUMMARY="${NIGHTLY_SUMMARY:?NIGHTLY_SUMMARY must point to nightly_summary.json}" +SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:?SLACK_WEBHOOK_URL is required}" +GITHUB_RUN_URL="${GITHUB_RUN_URL:-}" +REPORT_URL="${REPORT_URL:-}" +CUOPT_BRANCH="${CUOPT_BRANCH:-main}" + +if [ ! -f "${NIGHTLY_SUMMARY}" ]; then + echo "ERROR: Summary file not found: ${NIGHTLY_SUMMARY}" >&2 + exit 1 +fi + +# Build the entire Slack payload in Python for safe JSON handling. +# Shell variable interpolation into nested JSON is brittle; Python reads the +# summary file directly and produces a valid JSON payload on stdout. 
+PAYLOAD=$(python3 - "${NIGHTLY_SUMMARY}" "${CUOPT_BRANCH}" "${GITHUB_RUN_URL}" "${REPORT_URL}" <<'PYEOF' +import json, sys + +summary_path, branch, github_run_url, report_url = sys.argv[1:5] + +with open(summary_path) as f: + d = json.load(f) + +counts = d["counts"] +total = counts["total"] +passed = counts["passed"] +failed = counts["failed"] +flaky = counts["flaky"] +skipped = counts["skipped"] +resolved = counts.get("resolved", 0) +has_new = d["has_new_failures"] + +# --- Status line --- +if failed > 0: + if has_new: + emoji = ":rotating_light:" + text = "NEW test failures detected" + mention = " " + else: + emoji = ":x:" + text = "Recurring test failures" + mention = "" +elif flaky > 0: + emoji = ":large_yellow_circle:" + text = "All passed but flaky tests detected" + mention = "" +else: + emoji = ":white_check_mark:" + text = "All tests passed" + mention = "" + +stats = ( + f":white_check_mark: {passed} passed | :x: {failed} failed | " + f":warning: {flaky} flaky | :fast_forward: {skipped} skipped | Total: {total}" +) + +blocks = [] + +# Header +blocks.append({ + "type": "header", + "text": {"type": "plain_text", "text": f"cuOpt Nightly Tests \u2014 {branch}", "emoji": True}, +}) + +# Status summary +blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": f"{mention}{emoji} *{text}*\n\n{stats}"}, +}) + +blocks.append({"type": "divider"}) + +# --- Genuine failures --- +if failed > 0: + lines = [] + for f_entry in d.get("new_failures", []): + msg = f_entry.get("message", "")[:60].replace("\n", " ") + lines.append(f" :new: `{f_entry['name']}` ({f_entry['suite']}) \u2014 {msg}") + for f_entry in d.get("recurring_failures", []): + msg = f_entry.get("message", "")[:60].replace("\n", " ") + first = f_entry.get("first_seen", "?") + lines.append(f" :repeat: `{f_entry['name']}` ({f_entry['suite']}) \u2014 since {first}") + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": "*Genuine Failures:*\n" + "\n".join(lines)}, + }) + +# 
--- Stabilized tests --- +resolved_list = d.get("resolved_tests", []) +if resolved_list: + lines = [] + for r in resolved_list: + since = r.get("first_seen", "?") + count = r.get("failure_count", "?") + flaky_tag = " (was flaky)" if r.get("was_flaky") else "" + lines.append( + f" :white_check_mark: `{r['name']}` ({r['suite']}) \u2014 " + f"failing since {since}, failed {count}x{flaky_tag}" + ) + blocks.append({ + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Stabilized (were failing, now pass):*\n" + "\n".join(lines), + }, + }) + +# --- Flaky tests --- +flaky_list = d.get("flaky_tests", []) +if flaky_list: + lines = [] + for f_entry in flaky_list: + retries = f_entry.get("retry_count", "?") + lines.append(f" :warning: `{f_entry['name']}` ({f_entry['suite']}) \u2014 {retries} retries") + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": "*Flaky Tests (passed on retry):*\n" + "\n".join(lines)}, + }) + +# --- Links --- +link_parts = [] +if github_run_url: + link_parts.append(f"<{github_run_url}|GitHub Actions>") +if report_url: + link_parts.append(f"<{report_url}|Full Report>") +if link_parts: + blocks.append({"type": "divider"}) + blocks.append({ + "type": "context", + "elements": [{"type": "mrkdwn", "text": " ".join(link_parts)}], + }) + +payload = { + "channel": "cuopt-regression-testing", + "username": "cuOpt Nightly Bot", + "icon_emoji": ":robot_face:", + "blocks": blocks, +} +print(json.dumps(payload)) +PYEOF +) + +echo "Sending Slack notification..." +curl -s -X POST \ + -H 'Content-type: application/json' \ + --data "${PAYLOAD}" \ + "${SLACK_WEBHOOK_URL}" + +echo "" +echo "Slack notification sent." From 043180fb407b6b6ed6f9d976d8ab53eabf313295 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 11:46:33 -0500 Subject: [PATCH 05/60] Add nightly-summary job and secrets to test workflow Pass Slack webhook secret to all test jobs. 
Add nightly-summary job that runs after all test jobs complete, aggregates results from S3, sends a consolidated Slack notification, and uploads the dashboard. Pass S3 and Slack secrets via container-options for the custom job. --- .github/workflows/test.yaml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a8cc5f2943..4b52dbffe3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,6 +41,8 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + script-env-secret-4-key: CUOPT_SLACK_WEBHOOK_URL + script-env-secret-4-value: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} conda-python-tests: uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: @@ -57,6 +59,8 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + script-env-secret-4-key: CUOPT_SLACK_WEBHOOK_URL + script-env-secret-4-value: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} wheel-tests-cuopt: uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: @@ -72,6 +76,8 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + script-env-secret-4-key: CUOPT_SLACK_WEBHOOK_URL + script-env-secret-4-value: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} wheel-tests-cuopt-server: uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: @@ -87,6 +93,8 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ 
secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + script-env-secret-4-key: CUOPT_SLACK_WEBHOOK_URL + script-env-secret-4-value: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} conda-notebook-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main @@ -99,3 +107,29 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:26.06-latest" script: ci/test_notebooks.sh + nightly-summary: + if: ${{ always() && inputs.build_type == 'nightly' }} + needs: + - conda-cpp-tests + - conda-python-tests + - wheel-tests-cuopt + - wheel-tests-cuopt-server + - conda-notebook-tests + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + with: + build_type: ${{ inputs.build_type }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "cpu4" + arch: "amd64" + container_image: "rapidsai/ci-conda:26.06-latest" + container-options: >- + -e CUOPT_DATASET_S3_URI=${{ secrets.CUOPT_DATASET_S3_URI }} + -e CUOPT_AWS_ACCESS_KEY_ID=${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} + -e CUOPT_AWS_SECRET_ACCESS_KEY=${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + -e CUOPT_SLACK_WEBHOOK_URL=${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} + -e CUOPT_SLACK_BOT_TOKEN=${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + -e CUOPT_SLACK_CHANNEL_ID=${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + script: ci/nightly_summary.sh From a226607c5fb2d61b9348984b838affdd062eb9ed Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 11:47:19 -0500 Subject: [PATCH 06/60] Update developer skill with CI best practices Add pitfall entries for cross-cutting change discipline: full scope audits, code duplication, CI matrix parallelism, extensibility, and actionable reporting. 
--- skills/cuopt-developer/SKILL.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/skills/cuopt-developer/SKILL.md b/skills/cuopt-developer/SKILL.md index 99743f9171..98fe62f19c 100644 --- a/skills/cuopt-developer/SKILL.md +++ b/skills/cuopt-developer/SKILL.md @@ -295,6 +295,16 @@ rmm::device_uvector data(100, stream); | Missing `nvcc` | Set `$CUDACXX` or add CUDA to `$PATH` | | CUDA out of memory | Reduce problem size | | Slow debug library loading | Device symbols cause delay | + +| CI state doesn't persist between runs | CI containers are ephemeral. Never write persistent state to repo files from CI — use S3 (`CUOPT_DATASET_S3_URI`) or artifact stores. Ask: "After this container dies, does tomorrow's run see today's data?" | +| CI state transitions go unreported | When CI tracks state over time (e.g. test failures), every transition (new failure, recurring, stabilized) needs an explicit notification path. Ask: "When state X changes to Y, who learns about it and how?" | +| Designing CI features without lifecycle check | Before shipping any CI feature that tracks state: (1) Where does state live between runs? (2) What writes/reads it? (3) What happens on state transitions? Verify end-to-end, not just the happy-path logic. | +| Change applied to only some targets | Before implementing, audit the full scope of what needs the change. For CI: `ls ci/test*.sh`. For APIs: grep all callers. For patterns: find every instance. Enumerate ALL targets first, implement second. | +| Shared resource ignores CI matrix parallelism | CI matrices run jobs in parallel across CUDA x Python x arch. Any shared resource (S3 paths, files, databases) must be keyed by the full execution context. Ask: "What happens when N parallel jobs access this simultaneously?" | +| Same logic duplicated across files | When the same block (>10 lines) appears in 2+ places — any language, any context — extract a shared helper immediately. Don't duplicate first and refactor later. 
This applies to shell scripts, Python modules, C/C++ code equally. | +| Feature not extensible for new variants | After implementing, ask: "If someone adds a new variant (test type, matrix entry, endpoint, etc.), what do they change?" If the answer is more than a one-line addition, the design needs a shared helper or auto-discovery. Avoid hardcoded lists of known variants. | +| Reports generated without actionable detail | Reports and notifications must include enough context to act without digging: error messages, execution context (matrix, commit), history (new vs recurring), and links or attachments for full details. Provide downloadable artifacts when possible. | + ## Canonical Documentation From b6d13033a036ba335fff6e5c8fe0da47210df06b Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 12:10:42 -0500 Subject: [PATCH 07/60] Fix pre-commit: ruff format, copyright years, dependency files Apply ruff formatting to Python files, update copyright years to 2026 in shell scripts, regenerate conda environment files and pyproject.toml from dependencies.yaml, and remove hardcoded version from comment. --- ci/test_cpp.sh | 2 +- ci/test_notebooks.sh | 2 +- ci/utils/aggregate_nightly.py | 199 ++++++++------- ci/utils/nightly_report.py | 226 ++++++++++++------ ci/utils/s3_helpers.py | 31 ++- ci/utils/send_nightly_summary.sh | 2 +- .../all_cuda-129_arch-aarch64.yaml | 1 + .../all_cuda-129_arch-x86_64.yaml | 1 + .../all_cuda-131_arch-aarch64.yaml | 1 + .../all_cuda-131_arch-x86_64.yaml | 1 + .../cuopt/linear_programming/pyproject.toml | 1 + python/cuopt/pyproject.toml | 1 + python/cuopt_self_hosted/pyproject.toml | 1 + python/cuopt_server/pyproject.toml | 1 + 14 files changed, 308 insertions(+), 162 deletions(-) diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 4def832194..a68e0c7979 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -1,6 +1,6 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index b58c9a1d32..0b2b339ba1 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -1,6 +1,6 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 56ade2796e..4517ab3c6a 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -37,6 +37,7 @@ # Download and merge summaries # --------------------------------------------------------------------------- + def download_summaries(s3_prefix, local_dir): """Download all JSON summaries from S3 prefix into local_dir. 
Returns list of loaded summary dicts.""" @@ -56,8 +57,10 @@ def download_summaries(s3_prefix, local_dir): with open(local_path) as f: summaries.append(json.load(f)) except (json.JSONDecodeError, OSError) as exc: - print(f"WARNING: Failed to parse {local_path}: {exc}", - file=sys.stderr) + print( + f"WARNING: Failed to parse {local_path}: {exc}", + file=sys.stderr, + ) return summaries @@ -70,8 +73,9 @@ def load_local_summaries(local_dir): with open(json_file) as f: summaries.append(json.load(f)) except (json.JSONDecodeError, OSError) as exc: - print(f"WARNING: Failed to parse {json_file}: {exc}", - file=sys.stderr) + print( + f"WARNING: Failed to parse {json_file}: {exc}", file=sys.stderr + ) return summaries @@ -79,6 +83,7 @@ def load_local_summaries(local_dir): # Aggregation # --------------------------------------------------------------------------- + def aggregate_summaries(summaries): """Merge per-matrix summaries into a consolidated view. @@ -90,8 +95,12 @@ def aggregate_summaries(summaries): """ grid = [] totals = { - "total": 0, "passed": 0, "failed": 0, - "flaky": 0, "skipped": 0, "resolved": 0, + "total": 0, + "passed": 0, + "failed": 0, + "flaky": 0, + "skipped": 0, + "resolved": 0, } all_new_failures = [] all_recurring_failures = [] @@ -117,13 +126,15 @@ def aggregate_summaries(summaries): else: status = "passed" - grid.append({ - "test_type": test_type, - "matrix_label": matrix_label, - "status": status, - "counts": counts, - "sha": s.get("sha", ""), - }) + grid.append( + { + "test_type": test_type, + "matrix_label": matrix_label, + "status": status, + "counts": counts, + "sha": s.get("sha", ""), + } + ) # Accumulate totals for key in totals: @@ -157,18 +168,15 @@ def aggregate_summaries(summaries): # Consolidated JSON # --------------------------------------------------------------------------- + def generate_consolidated_json(agg, date_str, branch, github_run_url=""): """Generate the consolidated JSON for Slack and dashboard.""" total_jobs = 
len(agg["matrix_grid"]) failed_jobs = sum( 1 for g in agg["matrix_grid"] if g["status"].startswith("failed") ) - flaky_jobs = sum( - 1 for g in agg["matrix_grid"] if g["status"] == "flaky" - ) - passed_jobs = sum( - 1 for g in agg["matrix_grid"] if g["status"] == "passed" - ) + flaky_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "flaky") + passed_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "passed") return { "timestamp": datetime.now(timezone.utc).isoformat(), @@ -195,9 +203,11 @@ def generate_consolidated_json(agg, date_str, branch, github_run_url=""): # Consolidated HTML # --------------------------------------------------------------------------- + def _html_escape(text): return ( - str(text).replace("&", "&") + str(text) + .replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """) @@ -218,12 +228,16 @@ def _status_badge(status): return ( f'' - f'{label}' + f"{label}" ) def generate_consolidated_html( - agg, date_str, branch, github_run_url="", s3_reports_prefix="", + agg, + date_str, + branch, + github_run_url="", + s3_reports_prefix="", ): """Generate a consolidated HTML dashboard for all matrix combos.""" total_jobs = len(agg["matrix_grid"]) @@ -295,18 +309,18 @@ def generate_consolidated_html( if github_run_url: parts.append( f'  |  ' - f'GitHub Actions Run' + f"GitHub Actions Run" ) parts.append(f"""
{bar_text}
-
{totals['total']}
Total Tests
-
{totals['passed']}
Passed
-
{totals['failed']}
Failed
-
{totals['flaky']}
Flaky
-
Skipped
-
{totals['resolved']}
Stabilized
+
{totals["total"]}
Total Tests
+
{totals["passed"]}
Passed
+
{totals["failed"]}
Failed
+
{totals["flaky"]}
Flaky
+
Skipped
+
{totals["resolved"]}
Stabilized
""") # --- Matrix grid --- @@ -320,21 +334,19 @@ def generate_consolidated_html( # Build link to per-matrix HTML report on S3 report_link = "" if s3_reports_prefix: - report_filename = ( - f'{g["test_type"]}-{g["matrix_label"]}.html' - ) + report_filename = f"{g['test_type']}-{g['matrix_label']}.html" report_link = ( f'View' ) parts.append( - f'{_html_escape(g["test_type"])}' - f'{_html_escape(g["matrix_label"])}' - f'{_status_badge(g["status"])}' - f'{counts.get("passed", 0)}' - f'{counts.get("failed", 0)}' - f'{counts.get("flaky", 0)}' - f'{counts.get("total", 0)}' + f"{_html_escape(g['test_type'])}" + f"{_html_escape(g['matrix_label'])}" + f"{_status_badge(g['status'])}" + f"{counts.get('passed', 0)}" + f"{counts.get('failed', 0)}" + f"{counts.get('flaky', 0)}" + f"{counts.get('total', 0)}" f"{report_link}" ) parts.append("") @@ -350,11 +362,11 @@ def generate_consolidated_html( msg = _html_escape(e.get("message", "")) short = _html_escape(e.get("message", "")[:100]) parts.append( - f'{_html_escape(e["test_type"])}' - f'{_html_escape(e["matrix_label"])}' - f'{_html_escape(e["suite"])}' - f'{_html_escape(e["name"])}' - f'
{short}' + f"{_html_escape(e['test_type'])}" + f"{_html_escape(e['matrix_label'])}" + f"{_html_escape(e['suite'])}" + f"{_html_escape(e['name'])}" + f"
{short}" f'
{msg}
' ) parts.append("") @@ -370,12 +382,12 @@ def generate_consolidated_html( msg = _html_escape(e.get("message", "")) short = _html_escape(e.get("message", "")[:100]) parts.append( - f'{_html_escape(e["test_type"])}' - f'{_html_escape(e["matrix_label"])}' - f'{_html_escape(e["suite"])}' - f'{_html_escape(e["name"])}' - f'{_html_escape(e.get("first_seen", "?"))}' - f'
{short}' + f"{_html_escape(e['test_type'])}" + f"{_html_escape(e['matrix_label'])}" + f"{_html_escape(e['suite'])}" + f"{_html_escape(e['name'])}" + f"{_html_escape(e.get('first_seen', '?'))}" + f"
{short}" f'
{msg}
' ) parts.append("") @@ -389,12 +401,12 @@ def generate_consolidated_html( ) for e in agg["all_resolved_tests"]: parts.append( - f'{_html_escape(e["test_type"])}' - f'{_html_escape(e["matrix_label"])}' - f'{_html_escape(e["suite"])}' - f'{_html_escape(e["name"])}' - f'{_html_escape(e.get("first_seen", "?"))}' - f'{e.get("failure_count", "?")}' + f"{_html_escape(e['test_type'])}" + f"{_html_escape(e['matrix_label'])}" + f"{_html_escape(e['suite'])}" + f"{_html_escape(e['name'])}" + f"{_html_escape(e.get('first_seen', '?'))}" + f"{e.get('failure_count', '?')}" ) parts.append("") @@ -407,11 +419,11 @@ def generate_consolidated_html( ) for e in agg["all_flaky_tests"]: parts.append( - f'{_html_escape(e["test_type"])}' - f'{_html_escape(e["matrix_label"])}' - f'{_html_escape(e["suite"])}' - f'{_html_escape(e["name"])}' - f'{e.get("retry_count", "?")}' + f"{_html_escape(e['test_type'])}" + f"{_html_escape(e['matrix_label'])}" + f"{_html_escape(e['suite'])}" + f"{_html_escape(e['name'])}" + f"{e.get('retry_count', '?')}" ) parts.append("") @@ -480,44 +492,54 @@ def update_index(s3_index_uri, date_str, consolidated, output_dir): # Main # --------------------------------------------------------------------------- + def main(): parser = argparse.ArgumentParser( description="Aggregate per-matrix nightly test summaries" ) parser.add_argument( - "--s3-summaries-prefix", default="", + "--s3-summaries-prefix", + default="", help="S3 prefix for per-matrix JSON summaries (e.g., s3://bucket/.../summaries/2026-04-13/)", ) parser.add_argument( - "--s3-reports-prefix", default="", + "--s3-reports-prefix", + default="", help="S3 prefix where per-matrix HTML reports live (for linking)", ) parser.add_argument( - "--s3-output-uri", default="", + "--s3-output-uri", + default="", help="S3 URI to upload the consolidated JSON", ) parser.add_argument( - "--s3-html-output-uri", default="", + "--s3-html-output-uri", + default="", help="S3 URI to upload the consolidated HTML report", ) 
parser.add_argument( - "--s3-index-uri", default="", + "--s3-index-uri", + default="", help="S3 URI for the index.json that tracks all available dates (read + write)", ) parser.add_argument( - "--s3-dashboard-uri", default="", + "--s3-dashboard-uri", + default="", help="S3 URI to upload the dashboard HTML (e.g., s3://bucket/.../dashboard/index.html)", ) parser.add_argument( - "--dashboard-dir", default="", + "--dashboard-dir", + default="", help="Local directory containing dashboard files to upload", ) parser.add_argument( - "--local-summaries-dir", default="", + "--local-summaries-dir", + default="", help="Local directory with JSON summaries (alternative to S3, for testing)", ) parser.add_argument( - "--output-dir", default="aggregate-output", + "--output-dir", + default="aggregate-output", help="Local directory to write output files", ) parser.add_argument( @@ -527,7 +549,9 @@ def main(): ) parser.add_argument("--branch", default="main", help="Branch name") parser.add_argument( - "--github-run-url", default="", help="URL to the GitHub Actions run", + "--github-run-url", + default="", + help="URL to the GitHub Actions run", ) args = parser.parse_args() @@ -541,13 +565,17 @@ def main(): download_dir = output_dir / "downloaded_summaries" summaries = download_summaries(args.s3_summaries_prefix, download_dir) else: - print("ERROR: Provide --s3-summaries-prefix or --local-summaries-dir", - file=sys.stderr) + print( + "ERROR: Provide --s3-summaries-prefix or --local-summaries-dir", + file=sys.stderr, + ) return 1 if not summaries: - print("WARNING: No summaries found. Generating empty report.", - file=sys.stderr) + print( + "WARNING: No summaries found. 
Generating empty report.", + file=sys.stderr, + ) print(f"Loaded {len(summaries)} matrix summary file(s)") @@ -562,7 +590,10 @@ def main(): # ---- Step 3: Generate outputs ---- consolidated = generate_consolidated_json( - agg, args.date, args.branch, args.github_run_url, + agg, + args.date, + args.branch, + args.github_run_url, ) json_path = output_dir / "consolidated_summary.json" @@ -570,7 +601,10 @@ def main(): print(f"Consolidated JSON written to {json_path}") html_report = generate_consolidated_html( - agg, args.date, args.branch, args.github_run_url, + agg, + args.date, + args.branch, + args.github_run_url, args.s3_reports_prefix, ) html_path = output_dir / "consolidated_report.html" @@ -586,7 +620,10 @@ def main(): # ---- Step 5: Update index.json ---- if args.s3_index_uri: update_index( - args.s3_index_uri, args.date, consolidated, output_dir, + args.s3_index_uri, + args.date, + consolidated, + output_dir, ) # ---- Step 6: Upload dashboard ---- @@ -595,8 +632,10 @@ def main(): if dashboard_file.exists(): s3_upload(str(dashboard_file), args.s3_dashboard_uri) else: - print(f"WARNING: Dashboard not found at {dashboard_file}", - file=sys.stderr) + print( + f"WARNING: Dashboard not found at {dashboard_file}", + file=sys.stderr, + ) return 0 diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py index 40e2e65798..c098b4d66c 100755 --- a/ci/utils/nightly_report.py +++ b/ci/utils/nightly_report.py @@ -54,6 +54,7 @@ # JUnit XML parsing # --------------------------------------------------------------------------- + def parse_junit_xml(xml_path): """Parse a JUnit XML file and return a list of test result dicts.""" results = [] @@ -100,15 +101,17 @@ def parse_junit_xml(xml_path): status = "passed" message = "" - results.append({ - "suite": suite_name, - "classname": classname, - "name": name, - "status": status, - "time": time_taken, - "message": message, - "source_file": str(xml_path), - }) + results.append( + { + "suite": suite_name, + "classname": 
classname, + "name": name, + "status": status, + "time": time_taken, + "message": message, + "source_file": str(xml_path), + } + ) return results @@ -126,6 +129,7 @@ def collect_all_results(results_dir): # Classification # --------------------------------------------------------------------------- + def classify_failures(results): """ Classify test results into passed, failed, flaky, skipped, and error. @@ -173,6 +177,7 @@ def classify_failures(results): # History management # --------------------------------------------------------------------------- + def load_history(history_path): """Load failure history from a local JSON file.""" try: @@ -257,14 +262,16 @@ def update_history(history, classified, sha, date_str): rec["status"] = "resolved" rec["resolved_date"] = date_str rec["resolved_sha"] = sha - resolved_tests.append({ - "suite": rec["suite"], - "classname": rec["classname"], - "name": rec["name"], - "first_seen": rec["first_seen_date"], - "failure_count": rec["failure_count"], - "was_flaky": rec.get("is_flaky", False), - }) + resolved_tests.append( + { + "suite": rec["suite"], + "classname": rec["classname"], + "name": rec["name"], + "first_seen": rec["first_seen_date"], + "failure_count": rec["failure_count"], + "was_flaky": rec.get("is_flaky", False), + } + ) return history, new_failures, recurring_failures, resolved_tests @@ -280,9 +287,17 @@ def save_history(history, history_path): # Report generation # --------------------------------------------------------------------------- + def generate_markdown_report( - classified, new_failures, recurring_failures, resolved_tests, history, - test_type="", matrix_label="", sha="", date_str="", + classified, + new_failures, + recurring_failures, + resolved_tests, + history, + test_type="", + matrix_label="", + sha="", + date_str="", ): """Generate a Markdown summary report.""" lines = [] @@ -320,7 +335,9 @@ def generate_markdown_report( lines.append(f"| Flaky (passed on retry) | {total_flaky} |") lines.append(f"| 
Skipped | {total_skipped} |") if resolved_tests: - lines.append(f"| **Stabilized (were failing, now pass)** | **{len(resolved_tests)}** |") + lines.append( + f"| **Stabilized (were failing, now pass)** | **{len(resolved_tests)}** |" + ) lines.append("") # -- New genuine failures (highest priority) -- @@ -331,9 +348,13 @@ def generate_markdown_report( lines.append("|-------|------|-------|") for entry in new_failures: short_msg = ( - entry.get("message", "")[:80].replace("\n", " ").replace("|", "\\|") + entry.get("message", "")[:80] + .replace("\n", " ") + .replace("|", "\\|") + ) + lines.append( + f"| {entry['suite']} | `{entry['name']}` | {short_msg} |" ) - lines.append(f"| {entry['suite']} | `{entry['name']}` | {short_msg} |") lines.append("") # -- Recurring failures -- @@ -344,11 +365,19 @@ def generate_markdown_report( lines.append("|-------|------|------------|---------------|-------|") for entry in recurring_failures: short_msg = ( - entry.get("message", "")[:60].replace("\n", " ").replace("|", "\\|") + entry.get("message", "")[:60] + .replace("\n", " ") + .replace("|", "\\|") ) first_seen = entry.get("first_seen", "unknown") - test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" - count = history.get("tests", {}).get(test_key, {}).get("failure_count", "?") + test_key = ( + f"{entry['suite']}::{entry['classname']}::{entry['name']}" + ) + count = ( + history.get("tests", {}) + .get(test_key, {}) + .get("failure_count", "?") + ) lines.append( f"| {entry['suite']} | `{entry['name']}` | {first_seen} | {count} | {short_msg} |" ) @@ -358,8 +387,12 @@ def generate_markdown_report( if resolved_tests: lines.append("## Stabilized Tests (were failing, now passing)") lines.append("") - lines.append("| Suite | Test | Was failing since | Total failure count | Was flaky? |") - lines.append("|-------|------|-------------------|---------------------|------------|") + lines.append( + "| Suite | Test | Was failing since | Total failure count | Was flaky? 
|" + ) + lines.append( + "|-------|------|-------------------|---------------------|------------|" + ) for entry in resolved_tests: flaky_badge = "Yes" if entry.get("was_flaky") else "No" lines.append( @@ -376,7 +409,9 @@ def generate_markdown_report( lines.append("|-------|------|----------------|") for entry in classified["flaky"]: retry_count = entry.get("retry_count", "?") - lines.append(f"| {entry['suite']} | `{entry['name']}` | {retry_count} |") + lines.append( + f"| {entry['suite']} | `{entry['name']}` | {retry_count} |" + ) lines.append("") # -- Detailed errors -- @@ -405,8 +440,14 @@ def generate_markdown_report( def generate_json_summary( - classified, new_failures, recurring_failures, resolved_tests, - test_type="", matrix_label="", sha="", date_str="", + classified, + new_failures, + recurring_failures, + resolved_tests, + test_type="", + matrix_label="", + sha="", + date_str="", ): """Generate a JSON summary for downstream tools (Slack notifier, dashboard).""" return { @@ -470,6 +511,7 @@ def generate_json_summary( # HTML report # --------------------------------------------------------------------------- + def _html_escape(text): """Escape HTML special characters.""" return ( @@ -481,8 +523,15 @@ def _html_escape(text): def generate_html_report( - classified, new_failures, recurring_failures, resolved_tests, history, - test_type="", matrix_label="", sha="", date_str="", + classified, + new_failures, + recurring_failures, + resolved_tests, + history, + test_type="", + matrix_label="", + sha="", + date_str="", ): """Generate a self-contained HTML report with detailed failure info.""" total_passed = len(classified["passed"]) @@ -566,7 +615,9 @@ def generate_html_report( if sha: meta_parts.append(f"Commit: {_html_escape(sha[:12])}") if matrix_label: - meta_parts.append(f"Matrix: {_html_escape(matrix_label)}") + meta_parts.append( + f"Matrix: {_html_escape(matrix_label)}" + ) parts.append("  |  ".join(meta_parts)) parts.append(f""" @@ -582,23 +633,23 @@ 
def generate_html_report( # --- New failures --- if new_failures: - parts.append('

New Failures

') - parts.append('') + parts.append("

New Failures

SuiteTestError
") + parts.append("") for e in new_failures: msg = _html_escape(e.get("message", "")) short = _html_escape(e.get("message", "")[:100]) parts.append( - f'' - f'" + f"' - f'' ) parts.append("
SuiteTestError
{_html_escape(e["suite"])}{_html_escape(e["name"])} ' + f"
{_html_escape(e['suite'])}{_html_escape(e['name'])} " f'NEW
{short}' + f"
{short}" f'
{msg}
") # --- Recurring failures --- if recurring_failures: - parts.append('

Recurring Failures

') + parts.append("

Recurring Failures

") parts.append( "" "" @@ -608,22 +659,24 @@ def generate_html_report( short = _html_escape(e.get("message", "")[:100]) first_seen = _html_escape(e.get("first_seen", "unknown")) test_key = f"{e['suite']}::{e['classname']}::{e['name']}" - count = history.get("tests", {}).get(test_key, {}).get( - "failure_count", "?" + count = ( + history.get("tests", {}) + .get(test_key, {}) + .get("failure_count", "?") ) parts.append( - f'' - f'" + f"' f"" - f'' ) parts.append("
SuiteTestFirst SeenCountError
{_html_escape(e["suite"])}{_html_escape(e["name"])} ' + f"
{_html_escape(e['suite'])}{_html_escape(e['name'])} " f'RECURRING{first_seen}{count}
{short}' + f"
{short}" f'
{msg}
") # --- Stabilized --- if resolved_tests: - parts.append('

Stabilized Tests

') + parts.append("

Stabilized Tests

") parts.append( "" "" @@ -631,25 +684,25 @@ def generate_html_report( for e in resolved_tests: flaky_tag = "Yes" if e.get("was_flaky") else "No" parts.append( - f'' - f'" + f"' - f'' - f'' + f"" + f"" f"" ) parts.append("
SuiteTestFailing SinceFailure CountWas Flaky?
{_html_escape(e["suite"])}{_html_escape(e["name"])} ' + f"
{_html_escape(e['suite'])}{_html_escape(e['name'])} " f'FIXED{_html_escape(e.get("first_seen", "?"))}{e.get("failure_count", "?")}{_html_escape(e.get('first_seen', '?'))}{e.get('failure_count', '?')}{flaky_tag}
") # --- Flaky --- if classified["flaky"]: - parts.append('

Flaky Tests (passed on retry)

') + parts.append("

Flaky Tests (passed on retry)

") parts.append("") for e in classified["flaky"]: parts.append( - f'' - f'" + f"' - f'' + f"" ) parts.append("
SuiteTestRetries
{_html_escape(e["suite"])}{_html_escape(e["name"])} ' + f"
{_html_escape(e['suite'])}{_html_escape(e['name'])} " f'FLAKY{e.get("retry_count", "?")}
{e.get('retry_count', '?')}
") @@ -661,17 +714,19 @@ def generate_html_report( msg = _html_escape(e.get("message", "").strip()) parts.append( f'

' - f'{_html_escape(e["classname"])}::{_html_escape(e["name"])}

' + f"{_html_escape(e['classname'])}::{_html_escape(e['name'])}" f'

' - f'Suite: {_html_escape(e["suite"])}  |  ' - f'Source: {_html_escape(e["source_file"])}

' + f"Suite: {_html_escape(e['suite'])}  |  " + f"Source: {_html_escape(e['source_file'])}

" ) if msg: parts.append(f'
{msg}
') parts.append("") if not all_failures and not classified["flaky"] and not resolved_tests: - parts.append('

All tests passed! No failures or flaky tests detected.

') + parts.append( + '

All tests passed! No failures or flaky tests detected.

' + ) parts.append("") return "\n".join(parts) @@ -681,42 +736,50 @@ def generate_html_report( # Main # --------------------------------------------------------------------------- + def main(): parser = argparse.ArgumentParser( description="Generate nightly test failure report from JUnit XML results" ) parser.add_argument( - "--results-dir", required=True, + "--results-dir", + required=True, help="Directory containing JUnit XML test result files", ) parser.add_argument( - "--output-dir", default="report-output", + "--output-dir", + default="report-output", help="Directory to write report files to", ) parser.add_argument( - "--sha", default=os.environ.get("GITHUB_SHA", "unknown"), + "--sha", + default=os.environ.get("GITHUB_SHA", "unknown"), help="Git commit SHA for this run", ) parser.add_argument( - "--date", default=datetime.now(timezone.utc).strftime("%Y-%m-%d"), + "--date", + default=datetime.now(timezone.utc).strftime("%Y-%m-%d"), help="Date for this run (YYYY-MM-DD)", ) parser.add_argument( - "--test-type", default="unknown", + "--test-type", + default="unknown", help=( "Test type identifier (e.g., cpp, python, wheel-python, " "wheel-server, notebooks)" ), ) parser.add_argument( - "--matrix-label", default="", + "--matrix-label", + default="", help=( "Matrix combination label (e.g., cuda12.9-py3.12-x86_64). " "Included in reports and JSON summary to identify the CI job." ), ) parser.add_argument( - "--s3-history-uri", default="", + "--s3-history-uri", + default="", help=( "S3 URI for persistent failure history JSON. " "Downloaded before analysis, uploaded after update. " @@ -725,7 +788,8 @@ def main(): ), ) parser.add_argument( - "--s3-summary-uri", default="", + "--s3-summary-uri", + default="", help=( "S3 URI to upload this run's JSON snapshot for aggregation. 
" "Example: s3://bucket/ci_test_reports/nightly/summaries/" @@ -733,7 +797,8 @@ def main(): ), ) parser.add_argument( - "--s3-html-uri", default="", + "--s3-html-uri", + default="", help=( "S3 URI to upload the HTML report. " "Example: s3://bucket/ci_test_reports/nightly/reports/" @@ -780,7 +845,9 @@ def main(): ) if resolved_tests: - print(f"Stabilized: {len(resolved_tests)} previously-failing test(s) now pass") + print( + f"Stabilized: {len(resolved_tests)} previously-failing test(s) now pass" + ) save_history(history, local_history_path) print(f"Updated local history at {local_history_path}") @@ -798,7 +865,11 @@ def main(): ) md_report = generate_markdown_report( - classified, new_failures, recurring_failures, resolved_tests, history, + classified, + new_failures, + recurring_failures, + resolved_tests, + history, **report_kwargs, ) md_path = output_dir / "nightly_report.md" @@ -806,7 +877,11 @@ def main(): print(f"Markdown report written to {md_path}") html_report = generate_html_report( - classified, new_failures, recurring_failures, resolved_tests, history, + classified, + new_failures, + recurring_failures, + resolved_tests, + history, **report_kwargs, ) html_path = output_dir / "nightly_report.html" @@ -814,7 +889,10 @@ def main(): print(f"HTML report written to {html_path}") json_summary = generate_json_summary( - classified, new_failures, recurring_failures, resolved_tests, + classified, + new_failures, + recurring_failures, + resolved_tests, **report_kwargs, ) json_path = output_dir / "nightly_summary.json" @@ -836,10 +914,14 @@ def main(): # ---- Exit code ---- genuine_failures = len(classified["failed"]) + len(classified["error"]) if genuine_failures > 0: - print(f"\nFAILED: {genuine_failures} genuine test failure(s) detected.") + print( + f"\nFAILED: {genuine_failures} genuine test failure(s) detected." 
+ ) return 1 if classified["flaky"]: - print(f"\nWARNING: All tests passed but {len(classified['flaky'])} flaky test(s) detected.") + print( + f"\nWARNING: All tests passed but {len(classified['flaky'])} flaky test(s) detected." + ) else: print("\nAll tests passed.") return 0 diff --git a/ci/utils/s3_helpers.py b/ci/utils/s3_helpers.py index f1f5795661..572b61a409 100644 --- a/ci/utils/s3_helpers.py +++ b/ci/utils/s3_helpers.py @@ -20,7 +20,9 @@ def s3_env(): if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"): env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"] if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"): - env["AWS_SECRET_ACCESS_KEY"] = os.environ["CUOPT_AWS_SECRET_ACCESS_KEY"] + env["AWS_SECRET_ACCESS_KEY"] = os.environ[ + "CUOPT_AWS_SECRET_ACCESS_KEY" + ] if os.environ.get("CUOPT_AWS_REGION"): env["AWS_DEFAULT_REGION"] = os.environ["CUOPT_AWS_REGION"] elif "AWS_DEFAULT_REGION" not in env: @@ -34,12 +36,17 @@ def s3_download(s3_uri, local_path): try: subprocess.run( ["aws", "s3", "cp", s3_uri, local_path], - env=env, check=True, capture_output=True, text=True, + env=env, + check=True, + capture_output=True, + text=True, ) print(f"Downloaded {s3_uri}") return True except FileNotFoundError: - print("WARNING: aws CLI not found, skipping S3 download", file=sys.stderr) + print( + "WARNING: aws CLI not found, skipping S3 download", file=sys.stderr + ) return False except subprocess.CalledProcessError as exc: print( @@ -55,15 +62,22 @@ def s3_upload(local_path, s3_uri): try: subprocess.run( ["aws", "s3", "cp", local_path, s3_uri], - env=env, check=True, capture_output=True, text=True, + env=env, + check=True, + capture_output=True, + text=True, ) print(f"Uploaded {local_path} to {s3_uri}") return True except FileNotFoundError: - print("WARNING: aws CLI not found, skipping S3 upload", file=sys.stderr) + print( + "WARNING: aws CLI not found, skipping S3 upload", file=sys.stderr + ) return False except subprocess.CalledProcessError as exc: - print(f"WARNING: S3 
upload failed: {exc.stderr.strip()}", file=sys.stderr) + print( + f"WARNING: S3 upload failed: {exc.stderr.strip()}", file=sys.stderr + ) return False @@ -73,7 +87,10 @@ def s3_list(s3_prefix): try: result = subprocess.run( ["aws", "s3", "ls", s3_prefix], - env=env, check=True, capture_output=True, text=True, + env=env, + check=True, + capture_output=True, + text=True, ) except (FileNotFoundError, subprocess.CalledProcessError) as exc: print(f"WARNING: S3 ls failed: {exc}", file=sys.stderr) diff --git a/ci/utils/send_nightly_summary.sh b/ci/utils/send_nightly_summary.sh index 7c2d16519c..7b39a02cec 100755 --- a/ci/utils/send_nightly_summary.sh +++ b/ci/utils/send_nightly_summary.sh @@ -17,7 +17,7 @@ # Optional environment variables: # GITHUB_RUN_URL - Link to the GitHub Actions run # REPORT_URL - Link to the S3 HTML report -# CUOPT_BRANCH - Branch name (e.g. main, release/26.06) +# CUOPT_BRANCH - Branch name (e.g. main) set -euo pipefail diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 04dc6bb83c..e8000ffbb3 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -58,6 +58,7 @@ dependencies: - pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 21891cc9f2..43bc8996ad 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -58,6 +58,7 @@ dependencies: - pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 89147b18a7..5a53e13d37 100644 --- 
a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -58,6 +58,7 @@ dependencies: - pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index 8df6f28bf7..2efc26c0cb 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -58,6 +58,7 @@ dependencies: - pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 diff --git a/python/cuopt/cuopt/linear_programming/pyproject.toml b/python/cuopt/cuopt/linear_programming/pyproject.toml index 934b12f547..6e2c59c43c 100644 --- a/python/cuopt/cuopt/linear_programming/pyproject.toml +++ b/python/cuopt/cuopt/linear_programming/pyproject.toml @@ -37,6 +37,7 @@ Source = "https://github.com/nvidia/cuopt" [project.optional-dependencies] test = [ "pytest-cov", + "pytest-rerunfailures", "pytest<9.0", "rapids-logger==0.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml index eff7e01769..18b6e75276 100644 --- a/python/cuopt/pyproject.toml +++ b/python/cuopt/pyproject.toml @@ -47,6 +47,7 @@ classifiers = [ test = [ "numpy>=1.23.5,<3.0", "pytest-cov", + "pytest-rerunfailures", "pytest<9.0", "rapids-logger==0.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/cuopt_self_hosted/pyproject.toml b/python/cuopt_self_hosted/pyproject.toml index 43aa80a5b3..f4a3b75a60 100644 --- a/python/cuopt_self_hosted/pyproject.toml +++ b/python/cuopt_self_hosted/pyproject.toml @@ -37,6 +37,7 @@ classifiers = [ [project.optional-dependencies] test = [ "pytest-cov", + "pytest-rerunfailures", "pytest<9.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml index ce96c884be..4f9f141011 100644 --- a/python/cuopt_server/pyproject.toml +++ b/python/cuopt_server/pyproject.toml @@ -48,6 +48,7 @@ test = [ "msgpack==1.1.2", "pexpect", "pytest-cov", + "pytest-rerunfailures", "pytest<9.0", "requests", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 6b7605b900cc35fea9f1cb9ab3e61b4f915d62a1 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 12:23:08 -0500 Subject: [PATCH 08/60] Fix nightly-summary job: remove unsupported container-options secrets custom-job.yaml does not support secret references in container-options. Remove them and make nightly_summary.sh gracefully skip when CUOPT_DATASET_S3_URI is not available. 
--- .github/workflows/test.yaml | 7 ------- ci/nightly_summary.sh | 11 +++++------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4b52dbffe3..fcf1c5f42f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -125,11 +125,4 @@ jobs: node_type: "cpu4" arch: "amd64" container_image: "rapidsai/ci-conda:26.06-latest" - container-options: >- - -e CUOPT_DATASET_S3_URI=${{ secrets.CUOPT_DATASET_S3_URI }} - -e CUOPT_AWS_ACCESS_KEY_ID=${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} - -e CUOPT_AWS_SECRET_ACCESS_KEY=${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - -e CUOPT_SLACK_WEBHOOK_URL=${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} - -e CUOPT_SLACK_BOT_TOKEN=${{ secrets.CUOPT_SLACK_BOT_TOKEN }} - -e CUOPT_SLACK_CHANNEL_ID=${{ secrets.CUOPT_SLACK_CHANNEL_ID }} script: ci/nightly_summary.sh diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 93576e1795..53075d18c6 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -6,10 +6,8 @@ # consolidated Slack notification. Runs as a post-test job after all # matrix CI jobs finish. # -# Required environment variables: -# CUOPT_DATASET_S3_URI - S3 base URI -# CUOPT_AWS_ACCESS_KEY_ID - AWS credentials -# CUOPT_AWS_SECRET_ACCESS_KEY +# The script needs S3 access. It tries CUOPT_DATASET_S3_URI first, then +# falls back to standard AWS env vars set by aws-actions/configure-aws-credentials. # # Optional: # CUOPT_SLACK_WEBHOOK_URL - sends Slack if set @@ -28,8 +26,9 @@ BRANCH="${RAPIDS_BRANCH:-main}" GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}" if [ -z "${CUOPT_DATASET_S3_URI:-}" ]; then - echo "ERROR: CUOPT_DATASET_S3_URI is not set. Cannot aggregate." >&2 - exit 1 + echo "WARNING: CUOPT_DATASET_S3_URI is not set. Skipping nightly aggregation." >&2 + echo "The per-matrix reports (uploaded by individual test jobs) are still available on S3." 
+ exit 0 fi S3_BASE="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" From 84069f8b68e3b5fbc045caeb577f066f04f106c6 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 14:32:19 -0500 Subject: [PATCH 09/60] Remove unsupported script-env-secret-4 from test workflow The shared workflows only support 3 secret slots. The Slack webhook is only needed by the nightly-summary aggregation job which uses secrets: inherit. --- .github/workflows/test.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index fcf1c5f42f..7de1f43bbb 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,8 +41,6 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - script-env-secret-4-key: CUOPT_SLACK_WEBHOOK_URL - script-env-secret-4-value: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} conda-python-tests: uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: @@ -59,8 +57,6 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - script-env-secret-4-key: CUOPT_SLACK_WEBHOOK_URL - script-env-secret-4-value: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} wheel-tests-cuopt: uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: @@ -76,8 +72,6 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - script-env-secret-4-key: CUOPT_SLACK_WEBHOOK_URL - script-env-secret-4-value: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} wheel-tests-cuopt-server: uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: @@ -93,8 
+87,6 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - script-env-secret-4-key: CUOPT_SLACK_WEBHOOK_URL - script-env-secret-4-value: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} conda-notebook-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main From 9ae08520f5e41b29cf35c7e99be2446a1351e518 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 16:01:02 -0500 Subject: [PATCH 10/60] Add bounce detection and cross-run flaky classification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests that resolve then fail again within 14 days are recognized as bouncing rather than new failures. After 2+ bounces a test is automatically classified as cross-run flaky. Resolved tests only generate one notification — subsequent passes are silent. --- ci/utils/nightly_report.py | 102 ++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 14 deletions(-) diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py index c098b4d66c..a64f8f5a28 100755 --- a/ci/utils/nightly_report.py +++ b/ci/utils/nightly_report.py @@ -47,7 +47,14 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from s3_helpers import s3_download, s3_upload # noqa: E402 -EMPTY_HISTORY = {"_schema_version": 1, "tests": {}} +EMPTY_HISTORY = {"_schema_version": 2, "tests": {}} + +# A test that resolves then fails again within this window is considered +# "bouncing" (intermittently flaky) rather than a new failure. +BOUNCE_WINDOW_DAYS = 14 + +# Number of failure/resolve cycles that classify a test as cross-run flaky. 
+BOUNCE_THRESHOLD = 2 # --------------------------------------------------------------------------- @@ -190,12 +197,37 @@ def load_history(history_path): return dict(EMPTY_HISTORY) +def _days_between(date_a, date_b): + """Return absolute number of days between two YYYY-MM-DD strings.""" + try: + a = datetime.strptime(date_a, "%Y-%m-%d") + b = datetime.strptime(date_b, "%Y-%m-%d") + return abs((a - b).days) + except (ValueError, TypeError): + return 999 + + +def _is_recent_resolve(rec, date_str): + """Check if a test was resolved recently (within bounce window).""" + resolved_date = rec.get("resolved_date", "") + if not resolved_date: + return False + return _days_between(resolved_date, date_str) <= BOUNCE_WINDOW_DAYS + + def update_history(history, classified, sha, date_str): """ Update failure history with this run's results. Returns (history, new_failures, recurring_failures, resolved_tests). - resolved_tests = previously active failures that passed this run (stabilized). + + Classification logic: + - "new failure": never seen before (no history entry at all) + - "recurring": was already active (failing on previous runs) + - "bouncing": was resolved recently but failed again — reactivated + as recurring (not new), and marked cross-run flaky after 2+ bounces + - "resolved": was active, now passes — notified once, then silent + on subsequent passes """ tests = history.setdefault("tests", {}) new_failures = [] @@ -206,14 +238,46 @@ def update_history(history, classified, sha, date_str): for entry in classified["failed"] + classified["error"]: test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" - if test_key in tests and tests[test_key]["status"] == "active": - tests[test_key]["last_seen_date"] = date_str - tests[test_key]["last_seen_sha"] = sha - tests[test_key]["failure_count"] += 1 - recurring_failures.append( - {**entry, "first_seen": tests[test_key]["first_seen_date"]} - ) + if test_key in tests: + rec = tests[test_key] + + if rec["status"] == 
"active": + # Still failing — bump count + rec["last_seen_date"] = date_str + rec["last_seen_sha"] = sha + rec["failure_count"] += 1 + recurring_failures.append( + {**entry, "first_seen": rec["first_seen_date"]} + ) + elif rec["status"] == "resolved" and _is_recent_resolve( + rec, date_str + ): + # Bouncing: resolved recently but failed again. + # Reactivate as recurring, not new. Track the bounce. + rec["status"] = "active" + rec["last_seen_date"] = date_str + rec["last_seen_sha"] = sha + rec["failure_count"] += 1 + rec["bounce_count"] = rec.get("bounce_count", 0) + 1 + if rec["bounce_count"] >= BOUNCE_THRESHOLD: + rec["is_flaky"] = True + recurring_failures.append( + { + **entry, + "first_seen": rec["first_seen_date"], + "is_bouncing": True, + } + ) + else: + # Resolved long ago — treat as new cycle but keep history + rec["status"] = "active" + rec["last_seen_date"] = date_str + rec["last_seen_sha"] = sha + rec["failure_count"] += 1 + rec["bounce_count"] = rec.get("bounce_count", 0) + 1 + new_failures.append(entry) else: + # Truly new — never seen before tests[test_key] = { "suite": entry["suite"], "classname": entry["classname"], @@ -224,18 +288,24 @@ def update_history(history, classified, sha, date_str): "last_seen_sha": sha, "failure_count": 1, "is_flaky": False, + "bounce_count": 0, "status": "active", } new_failures.append(entry) - # --- Flaky tests --- + # --- Flaky tests (passed on retry within this run) --- for entry in classified["flaky"]: test_key = f"{entry['suite']}::{entry['classname']}::{entry['name']}" if test_key in tests: - tests[test_key]["last_seen_date"] = date_str - tests[test_key]["last_seen_sha"] = sha - tests[test_key]["failure_count"] += 1 - tests[test_key]["is_flaky"] = True + rec = tests[test_key] + rec["last_seen_date"] = date_str + rec["last_seen_sha"] = sha + rec["failure_count"] += 1 + rec["is_flaky"] = True + # If it was resolved, reactivate — it's still unstable + if rec["status"] == "resolved": + rec["status"] = "active" + 
rec["bounce_count"] = rec.get("bounce_count", 0) + 1 else: tests[test_key] = { "suite": entry["suite"], @@ -247,6 +317,7 @@ def update_history(history, classified, sha, date_str): "last_seen_sha": sha, "failure_count": 1, "is_flaky": True, + "bounce_count": 0, "status": "active", } @@ -269,9 +340,12 @@ def update_history(history, classified, sha, date_str): "name": rec["name"], "first_seen": rec["first_seen_date"], "failure_count": rec["failure_count"], + "bounce_count": rec.get("bounce_count", 0), "was_flaky": rec.get("is_flaky", False), } ) + # If already "resolved" and passes again — no notification. + # The resolved notification was sent once when it first stabilized. return history, new_failures, recurring_failures, resolved_tests From d34bf28e9359d8bb507766f42fdf9b8495d10784 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 16:03:38 -0500 Subject: [PATCH 11/60] Make bounce window and threshold configurable via env vars CUOPT_BOUNCE_WINDOW_DAYS (default 14) and CUOPT_BOUNCE_THRESHOLD (default 2) can now be set as environment variables to tune flaky test detection without code changes. --- ci/utils/nightly_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py index a64f8f5a28..2bd23b1f18 100755 --- a/ci/utils/nightly_report.py +++ b/ci/utils/nightly_report.py @@ -51,10 +51,10 @@ # A test that resolves then fails again within this window is considered # "bouncing" (intermittently flaky) rather than a new failure. -BOUNCE_WINDOW_DAYS = 14 +BOUNCE_WINDOW_DAYS = int(os.environ.get("CUOPT_BOUNCE_WINDOW_DAYS", 14)) # Number of failure/resolve cycles that classify a test as cross-run flaky. 
-BOUNCE_THRESHOLD = 2 +BOUNCE_THRESHOLD = int(os.environ.get("CUOPT_BOUNCE_THRESHOLD", 2)) # --------------------------------------------------------------------------- From 2eae30b4ab8edfdf9d6611176bbb9c6aef5cd183 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 17:47:23 -0500 Subject: [PATCH 12/60] Convert nightly-summary to inline job for secret access The custom-job.yaml reusable workflow does not expose secrets as env vars. Convert nightly-summary to an inline job that directly sets all required secrets (S3, Slack) in the step environment. --- .github/workflows/test.yaml | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 7de1f43bbb..3ced840676 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -107,14 +107,21 @@ jobs: - wheel-tests-cuopt - wheel-tests-cuopt-server - conda-notebook-tests - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - with: - build_type: ${{ inputs.build_type }} - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - node_type: "cpu4" - arch: "amd64" - container_image: "rapidsai/ci-conda:26.06-latest" - script: ci/nightly_summary.sh + runs-on: linux-amd64-cpu4 + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.sha }} + - name: Install dependencies + run: pip install awscli + - name: Run nightly summary + env: + CUOPT_DATASET_S3_URI: ${{ secrets.CUOPT_DATASET_S3_URI }} + CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} + CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} + CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + RAPIDS_BRANCH: ${{ inputs.branch }} + run: bash 
ci/nightly_summary.sh From e977784e9dc3466f59985dbb78fab0128ca2d48c Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Mon, 13 Apr 2026 17:48:35 -0500 Subject: [PATCH 13/60] Fix S3 auth: prefer role-based credentials over CUOPT_AWS overrides In CI, aws-actions/configure-aws-credentials sets role-based tokens (AWS_ACCESS_KEY_ID + AWS_SESSION_TOKEN). The CUOPT_AWS_* overrides were replacing these with static keys that lack the session token, causing InvalidToken errors. Now only fall back to CUOPT_AWS_* when standard AWS credentials are not already set. --- ci/utils/s3_helpers.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/ci/utils/s3_helpers.py b/ci/utils/s3_helpers.py index 572b61a409..a550a869d5 100644 --- a/ci/utils/s3_helpers.py +++ b/ci/utils/s3_helpers.py @@ -15,14 +15,21 @@ def s3_env(): - """Build env dict with CUOPT AWS credentials mapped to standard AWS vars.""" + """Build env dict for AWS CLI calls. + + Prefers credentials already set by aws-actions/configure-aws-credentials + (role-based tokens via AWS_ACCESS_KEY_ID / AWS_SESSION_TOKEN). Falls + back to CUOPT_AWS_* overrides only when standard AWS vars are absent. 
+ """ env = os.environ.copy() - if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"): - env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"] - if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"): - env["AWS_SECRET_ACCESS_KEY"] = os.environ[ - "CUOPT_AWS_SECRET_ACCESS_KEY" - ] + # Only override if standard AWS credentials are not already configured + if not os.environ.get("AWS_ACCESS_KEY_ID"): + if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"): + env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"] + if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"): + env["AWS_SECRET_ACCESS_KEY"] = os.environ[ + "CUOPT_AWS_SECRET_ACCESS_KEY" + ] if os.environ.get("CUOPT_AWS_REGION"): env["AWS_DEFAULT_REGION"] = os.environ["CUOPT_AWS_REGION"] elif "AWS_DEFAULT_REGION" not in env: From 124aeb2ea6237eed3bc1e77a5ec2cb452a4cb340 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 09:48:39 -0500 Subject: [PATCH 14/60] Fix S3 auth: use CUOPT_AWS static keys and unset session token The cuOpt S3 bucket requires CUOPT_AWS_* static credentials. The role-based session token from aws-actions/configure-aws-credentials was causing InvalidToken errors. Always override with CUOPT_AWS_* and unset AWS_SESSION_TOKEN, matching the pattern in datasets/*.sh. --- ci/utils/s3_helpers.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/ci/utils/s3_helpers.py b/ci/utils/s3_helpers.py index a550a869d5..be1d2c872b 100644 --- a/ci/utils/s3_helpers.py +++ b/ci/utils/s3_helpers.py @@ -15,21 +15,23 @@ def s3_env(): - """Build env dict for AWS CLI calls. + """Build env dict for AWS CLI calls using CUOPT-specific credentials. - Prefers credentials already set by aws-actions/configure-aws-credentials - (role-based tokens via AWS_ACCESS_KEY_ID / AWS_SESSION_TOKEN). Falls - back to CUOPT_AWS_* overrides only when standard AWS vars are absent. + The cuOpt S3 bucket requires explicit CUOPT_AWS_* static credentials. 
+ Role-based credentials from aws-actions/configure-aws-credentials do not + have access. We override AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY with + the CUOPT_* values and unset AWS_SESSION_TOKEN to avoid mixing with + role-based session tokens (matching the pattern in datasets/*.sh). """ env = os.environ.copy() - # Only override if standard AWS credentials are not already configured - if not os.environ.get("AWS_ACCESS_KEY_ID"): - if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"): - env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"] - if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"): - env["AWS_SECRET_ACCESS_KEY"] = os.environ[ - "CUOPT_AWS_SECRET_ACCESS_KEY" - ] + if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"): + env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"] + if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"): + env["AWS_SECRET_ACCESS_KEY"] = os.environ[ + "CUOPT_AWS_SECRET_ACCESS_KEY" + ] + # Unset session token to avoid mixing role-based tokens with static keys + env.pop("AWS_SESSION_TOKEN", None) if os.environ.get("CUOPT_AWS_REGION"): env["AWS_DEFAULT_REGION"] = os.environ["CUOPT_AWS_REGION"] elif "AWS_DEFAULT_REGION" not in env: From 338b7bbf21099ae853075e471c56ee03921020d0 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 11:26:18 -0500 Subject: [PATCH 15/60] Extract nightly-summary into reusable workflow Move the nightly-summary job out of test.yaml into its own nightly-summary.yaml reusable workflow. Runs in a python:3.12-slim container to avoid PEP 668 externally-managed-environment errors when installing awscli. Also adds workflow_dispatch trigger so the summary can be re-run manually against an earlier test run. 
--- .github/workflows/nightly-summary.yaml | 69 ++++++++++++++++++++++++++ .github/workflows/test.yaml | 30 +++++------ 2 files changed, 81 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/nightly-summary.yaml diff --git a/.github/workflows/nightly-summary.yaml b/.github/workflows/nightly-summary.yaml new file mode 100644 index 0000000000..166853a1f3 --- /dev/null +++ b/.github/workflows/nightly-summary.yaml @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: nightly-summary + +on: + workflow_dispatch: + inputs: + branch: + description: "Branch name the run targets" + required: true + type: string + default: main + sha: + description: "Full git commit SHA to check out" + required: true + type: string + build_type: + description: "Build type (nightly, pull-request, branch)" + required: true + type: string + default: nightly + workflow_call: + inputs: + branch: + required: true + type: string + sha: + required: true + type: string + build_type: + required: true + type: string + secrets: + CUOPT_DATASET_S3_URI: + required: true + CUOPT_AWS_ACCESS_KEY_ID: + required: true + CUOPT_AWS_SECRET_ACCESS_KEY: + required: true + CUOPT_SLACK_WEBHOOK_URL: + required: false + CUOPT_SLACK_BOT_TOKEN: + required: false + CUOPT_SLACK_CHANNEL_ID: + required: false + +jobs: + nightly-summary: + runs-on: linux-amd64-cpu4 + container: + image: python:3.12-slim + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.sha }} + - name: Install dependencies + run: pip install awscli + - name: Run nightly summary + env: + CUOPT_DATASET_S3_URI: ${{ secrets.CUOPT_DATASET_S3_URI }} + CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} + CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} + CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + 
CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + RAPIDS_BRANCH: ${{ inputs.branch }} + run: bash ci/nightly_summary.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3ced840676..097f607244 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -107,21 +107,15 @@ jobs: - wheel-tests-cuopt - wheel-tests-cuopt-server - conda-notebook-tests - runs-on: linux-amd64-cpu4 - steps: - - uses: actions/checkout@v6 - with: - ref: ${{ inputs.sha }} - - name: Install dependencies - run: pip install awscli - - name: Run nightly summary - env: - CUOPT_DATASET_S3_URI: ${{ secrets.CUOPT_DATASET_S3_URI }} - CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} - CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} - CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} - CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} - RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} - RAPIDS_BRANCH: ${{ inputs.branch }} - run: bash ci/nightly_summary.sh + uses: ./.github/workflows/nightly-summary.yaml + with: + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + build_type: ${{ inputs.build_type }} + secrets: + CUOPT_DATASET_S3_URI: ${{ secrets.CUOPT_DATASET_S3_URI }} + CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} + CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} + CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} From 15641c508a1839b3354ee343b2be8a466c78cf6c Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 11:52:59 -0500 Subject: [PATCH 16/60] Add curl to nightly-summary container for Slack notifications The python:3.12-slim image doesn't include curl, which is needed by 
send_consolidated_summary.sh for Slack webhook and file upload. --- .github/workflows/nightly-summary.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly-summary.yaml b/.github/workflows/nightly-summary.yaml index 166853a1f3..214fe84c97 100644 --- a/.github/workflows/nightly-summary.yaml +++ b/.github/workflows/nightly-summary.yaml @@ -55,7 +55,9 @@ jobs: with: ref: ${{ inputs.sha }} - name: Install dependencies - run: pip install awscli + run: | + apt-get update && apt-get install -y --no-install-recommends curl + pip install awscli - name: Run nightly summary env: CUOPT_DATASET_S3_URI: ${{ secrets.CUOPT_DATASET_S3_URI }} From 477b273825445cac455b0a53e1cc3be273739f42 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 12:52:23 -0500 Subject: [PATCH 17/60] Improve nightly Slack reporting and thirdparty test visibility - Filter consolidated.json from S3 aggregation to fix "unknown" entry - Migrate Slack file upload from deprecated files.upload to getUploadURLExternal + completeUploadExternal - Chunk Slack messages into header/grid/details/links to stay within block and character limits - Remove S3 link from Slack in favor of HTML file attachment - Add --junitxml to Pyomo, CvxPy, and PuLP thirdparty test scripts so failures appear in nightly reports - Export RAPIDS_TESTS_DIR from test_wheel_cuopt.sh for subprocesses --- ci/nightly_summary.sh | 1 - ci/test_wheel_cuopt.sh | 1 + ci/thirdparty-testing/run_cvxpy_tests.sh | 4 + ci/thirdparty-testing/run_pulp_tests.sh | 4 + ci/thirdparty-testing/run_pyomo_tests.sh | 4 + ci/utils/aggregate_nightly.py | 5 +- ci/utils/send_consolidated_summary.sh | 275 +++++++++++++---------- 7 files changed, 177 insertions(+), 117 deletions(-) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 53075d18c6..c0aab7a52d 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -63,7 +63,6 @@ if [ -n "${CUOPT_SLACK_WEBHOOK_URL:-}" ] && [ 
"${RAPIDS_BUILD_TYPE:-}" = "nightl SLACK_WEBHOOK_URL="${CUOPT_SLACK_WEBHOOK_URL}" \ SLACK_BOT_TOKEN="${CUOPT_SLACK_BOT_TOKEN:-}" \ SLACK_CHANNEL_ID="${CUOPT_SLACK_CHANNEL_ID:-}" \ - REPORT_URL="${S3_CONSOLIDATED_HTML}" \ bash "${SCRIPT_DIR}/utils/send_consolidated_summary.sh" fi diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh index 5d002731b0..878db67594 100755 --- a/ci/test_wheel_cuopt.sh +++ b/ci/test_wheel_cuopt.sh @@ -64,6 +64,7 @@ RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" export RAPIDS_DATASET_ROOT_DIR RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +export RAPIDS_TESTS_DIR mkdir -p "${RAPIDS_TESTS_DIR}" EXITCODE=0 diff --git a/ci/thirdparty-testing/run_cvxpy_tests.sh b/ci/thirdparty-testing/run_cvxpy_tests.sh index c336f6a800..4b874fc4f0 100755 --- a/ci/thirdparty-testing/run_cvxpy_tests.sh +++ b/ci/thirdparty-testing/run_cvxpy_tests.sh @@ -32,10 +32,14 @@ python -m pip install \ # ensure that environment is still consistent (i.e. cvxpy requirements do not conflict with cuopt's) pip check +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +mkdir -p "${RAPIDS_TESTS_DIR}" + echo "running 'cvxpy' tests" timeout 3m python -m pytest \ --verbose \ --capture=no \ --error-for-skips \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-cvxpy.xml" \ -k "TestCUOPT" \ ./cvxpy/tests/test_conic_solvers.py diff --git a/ci/thirdparty-testing/run_pulp_tests.sh b/ci/thirdparty-testing/run_pulp_tests.sh index f9cb0ca8a5..2c26db7a23 100755 --- a/ci/thirdparty-testing/run_pulp_tests.sh +++ b/ci/thirdparty-testing/run_pulp_tests.sh @@ -23,6 +23,9 @@ python -m pip install \ pip check +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +mkdir -p "${RAPIDS_TESTS_DIR}" + rapids-logger "running PuLP tests (cuOpt-related)" # PuLP uses pytest; run only tests that reference cuopt/CUOPT # Exit code 5 = no tests collected; then try run_tests.py which detects solvers (including cuopt) @@ -30,6 +33,7 @@ pytest_rc=0 timeout 5m python -m pytest \ 
--verbose \ --capture=no \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-pulp.xml" \ -k "cuopt or CUOPT" \ pulp/tests/ || pytest_rc=$? diff --git a/ci/thirdparty-testing/run_pyomo_tests.sh b/ci/thirdparty-testing/run_pyomo_tests.sh index f50df676c9..d2b0639f6e 100755 --- a/ci/thirdparty-testing/run_pyomo_tests.sh +++ b/ci/thirdparty-testing/run_pyomo_tests.sh @@ -23,11 +23,15 @@ python -m pip install \ pip check +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +mkdir -p "${RAPIDS_TESTS_DIR}" + rapids-logger "running Pyomo tests (cuopt_direct / cuOpt-related)" # Run only tests that reference cuopt (cuopt_direct solver) timeout 5m python -m pytest \ --verbose \ --capture=no \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-pyomo.xml" \ -k "cuopt or CUOPT" \ pyomo/solvers/tests/ diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 4517ab3c6a..31e567f487 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -45,7 +45,10 @@ def download_summaries(s3_prefix, local_dir): local_dir.mkdir(parents=True, exist_ok=True) uris = s3_list(s3_prefix) - json_uris = [u for u in uris if u.endswith(".json")] + json_uris = [ + u for u in uris + if u.endswith(".json") and not u.endswith("/consolidated.json") + ] print(f"Found {len(json_uris)} summary file(s) at {s3_prefix}") summaries = [] diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 4f421678dc..32f9f8005b 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -2,21 +2,20 @@ # SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Send a single consolidated Slack notification for the entire nightly run. 
-# Reads the aggregated JSON produced by aggregate_nightly.py and sends a rich -# Slack message with: -# - Matrix grid overview (test_type x matrix → status) -# - Failure tables with :new: / :repeat: badges and matrix context -# - @channel on new genuine failures -# - Stabilized and flaky test summaries -# - Link to GitHub Actions run and consolidated HTML report +# Send a consolidated Slack notification for the entire nightly run. +# Reads the aggregated JSON produced by aggregate_nightly.py and sends +# chunked Slack messages: +# 1. Header + status summary + test totals +# 2. Matrix grid (passed / failed / flaky, chunked by test type) +# 3. Failure details (new, recurring, stabilized, flaky) +# 4. Links +# Then uploads the HTML report as a Slack file. # # Required environment variables: # SLACK_WEBHOOK_URL - Slack incoming webhook URL # CONSOLIDATED_SUMMARY - Path to consolidated_summary.json # # Optional environment variables: -# REPORT_URL - Link to the consolidated HTML report on S3 # CONSOLIDATED_HTML - Path to consolidated HTML file to upload to Slack # SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) for file uploads # SLACK_CHANNEL_ID - Slack channel ID for file uploads (required with bot token) @@ -25,7 +24,6 @@ set -euo pipefail CONSOLIDATED_SUMMARY="${CONSOLIDATED_SUMMARY:?CONSOLIDATED_SUMMARY must point to consolidated_summary.json}" SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:?SLACK_WEBHOOK_URL is required}" -REPORT_URL="${REPORT_URL:-}" CONSOLIDATED_HTML="${CONSOLIDATED_HTML:-}" SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}" SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}" @@ -35,10 +33,11 @@ if [ ! 
-f "${CONSOLIDATED_SUMMARY}" ]; then exit 1 fi -PAYLOAD=$(python3 - "${CONSOLIDATED_SUMMARY}" "${REPORT_URL}" <<'PYEOF' +# Generate chunked Slack payloads — one JSON object per line +PAYLOADS=$(python3 - "${CONSOLIDATED_SUMMARY}" <<'PYEOF' import json, sys -summary_path, report_url = sys.argv[1:3] +summary_path = sys.argv[1] with open(summary_path) as f: d = json.load(f) @@ -56,7 +55,25 @@ failed_jobs = jobs.get("failed", 0) flaky_jobs = jobs.get("flaky", 0) passed_jobs = jobs.get("passed", 0) -# --- Status line --- +status_icons = { + "passed": ":white_check_mark:", + "failed-new": ":rotating_light:", + "failed-recurring": ":x:", + "flaky": ":warning:", + "no-results": ":grey_question:", +} + +def make_payload(blocks): + return json.dumps({ + "username": "cuOpt Nightly Bot", + "icon_emoji": ":robot_face:", + "blocks": blocks, + }) + + +# ── Message 1: Header + status + totals ────────────────────────────── +blocks = [] + if failed_jobs > 0 and has_new: emoji = ":rotating_light:" text = f"NEW test failures in {failed_jobs} matrix job(s)" @@ -82,9 +99,6 @@ stats = ( f"Total: {totals.get('total', 0)}" ) -blocks = [] - -# Header blocks.append({ "type": "header", "text": { @@ -93,8 +107,6 @@ blocks.append({ "emoji": True, }, }) - -# Status summary blocks.append({ "type": "section", "text": { @@ -102,25 +114,18 @@ blocks.append({ "text": f"{mention}{emoji} *{text}*\n\n{stats}", }, }) +print(make_payload(blocks)) -blocks.append({"type": "divider"}) -# --- Matrix grid (compact) --- -# Group by test_type for readability +# ── Message 2: Matrix grid (chunked by test type) ──────────────────── test_types = {} for g in grid: tt = g["test_type"] test_types.setdefault(tt, []).append(g) -status_icons = { - "passed": ":white_check_mark:", - "failed-new": ":rotating_light:", - "failed-recurring": ":x:", - "flaky": ":warning:", - "no-results": ":grey_question:", -} - -grid_lines = [] +# Split into sections that fit within Slack's 3000 char limit per block +grid_blocks = [] 
+current_text = "" for tt, entries in sorted(test_types.items()): cells = [] for g in entries: @@ -131,154 +136,194 @@ for tt, entries in sorted(test_types.items()): cells.append(f"{icon} `{label}` ({failed_count} failures)") else: cells.append(f"{icon} `{label}`") - grid_lines.append(f"*{tt}*\n" + "\n".join(f" {c}" for c in cells)) - -# Slack blocks have a 3000 char limit per text field; truncate if needed -grid_text = "\n".join(grid_lines) -if len(grid_text) > 2900: - # Summarize instead of full grid - grid_text = ( - f"*Matrix Summary:* {passed_jobs} passed, {failed_jobs} failed, " - f"{flaky_jobs} flaky out of {total_jobs} jobs\n" - f"_(Full matrix in report link below)_" - ) + section = f"*{tt}*\n" + "\n".join(f" {c}" for c in cells) + "\n" + + # If adding this section would exceed limit, flush current block + if current_text and len(current_text) + len(section) > 2800: + grid_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current_text.rstrip()}, + }) + current_text = "" + current_text += section + +if current_text: + grid_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current_text.rstrip()}, + }) -blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": grid_text}, -}) +# Chunk grid blocks into messages of at most 48 blocks (leave room for divider) +for i in range(0, len(grid_blocks), 48): + chunk = grid_blocks[i:i+48] + print(make_payload([{"type": "divider"}] + chunk)) -# --- New failures (max 10 to avoid hitting Slack limits) --- + +# ── Message 3: Failure details ──────────────────────────────────────── +detail_blocks = [] + +# New failures new_failures = d.get("new_failures", []) if new_failures: - blocks.append({"type": "divider"}) lines = [] - for f_entry in new_failures[:10]: - msg = f_entry.get("message", "")[:50].replace("\n", " ") + for f_entry in new_failures[:15]: + msg = f_entry.get("message", "")[:80].replace("\n", " ") matrix = f_entry.get("matrix_label", "") lines.append( 
- f" :new: `{f_entry['name']}` ({f_entry['test_type']} / {matrix}) \u2014 {msg}" + f":new: `{f_entry['name']}` ({f_entry['test_type']} / {matrix})\n {msg}" ) - if len(new_failures) > 10: - lines.append(f" _...and {len(new_failures) - 10} more_") - blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": "*New Failures:*\n" + "\n".join(lines)}, - }) - -# --- Recurring failures (max 10) --- + if len(new_failures) > 15: + lines.append(f"_...and {len(new_failures) - 15} more_") + text = "*:rotating_light: New Failures:*\n" + "\n".join(lines) + # Split into 3000-char chunks if needed + while text: + detail_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": text[:2900]}, + }) + text = text[2900:] + +# Recurring failures recurring = d.get("recurring_failures", []) if recurring: - blocks.append({"type": "divider"}) lines = [] - for f_entry in recurring[:10]: + for f_entry in recurring[:15]: matrix = f_entry.get("matrix_label", "") first = f_entry.get("first_seen", "?") lines.append( - f" :repeat: `{f_entry['name']}` ({f_entry['test_type']} / {matrix}) \u2014 since {first}" + f":repeat: `{f_entry['name']}` ({f_entry['test_type']} / {matrix}) \u2014 since {first}" ) - if len(recurring) > 10: - lines.append(f" _...and {len(recurring) - 10} more_") - blocks.append({ + if len(recurring) > 15: + lines.append(f"_...and {len(recurring) - 15} more_") + detail_blocks.append({"type": "divider"}) + detail_blocks.append({ "type": "section", - "text": {"type": "mrkdwn", "text": "*Recurring Failures:*\n" + "\n".join(lines)}, + "text": {"type": "mrkdwn", "text": "*:x: Recurring Failures:*\n" + "\n".join(lines)}, }) -# --- Stabilized --- +# Stabilized resolved = d.get("resolved_tests", []) if resolved: lines = [] - for r in resolved[:5]: + for r in resolved[:10]: matrix = r.get("matrix_label", "") count = r.get("failure_count", "?") lines.append( - f" :white_check_mark: `{r['name']}` ({r['test_type']} / {matrix}) \u2014 failed {count}x" + 
f":white_check_mark: `{r['name']}` ({r['test_type']} / {matrix}) \u2014 failed {count}x" ) - if len(resolved) > 5: - lines.append(f" _...and {len(resolved) - 5} more_") - blocks.append({ + if len(resolved) > 10: + lines.append(f"_...and {len(resolved) - 10} more_") + detail_blocks.append({"type": "divider"}) + detail_blocks.append({ "type": "section", "text": { "type": "mrkdwn", - "text": "*Stabilized (were failing, now pass):*\n" + "\n".join(lines), + "text": "*:white_check_mark: Stabilized (were failing, now pass):*\n" + "\n".join(lines), }, }) -# --- Flaky summary (count only to save space) --- +# Flaky summary flaky = d.get("flaky_tests", []) if flaky: - # Group by test name to show unique flaky tests unique_flaky = {} for f_entry in flaky: key = f_entry["name"] unique_flaky.setdefault(key, []).append(f_entry.get("matrix_label", "")) lines = [] - for name, matrices in sorted(unique_flaky.items())[:5]: + for name, matrices in sorted(unique_flaky.items())[:10]: matrix_str = ", ".join(matrices[:3]) if len(matrices) > 3: matrix_str += f" +{len(matrices)-3} more" - lines.append(f" :warning: `{name}` ({matrix_str})") - if len(unique_flaky) > 5: - lines.append(f" _...and {len(unique_flaky) - 5} more unique flaky tests_") - blocks.append({ + lines.append(f":warning: `{name}` ({matrix_str})") + if len(unique_flaky) > 10: + lines.append(f"_...and {len(unique_flaky) - 10} more unique flaky tests_") + detail_blocks.append({"type": "divider"}) + detail_blocks.append({ "type": "section", - "text": {"type": "mrkdwn", "text": "*Flaky Tests:*\n" + "\n".join(lines)}, + "text": {"type": "mrkdwn", "text": "*:warning: Flaky Tests:*\n" + "\n".join(lines)}, }) -# --- Links --- +if detail_blocks: + print(make_payload(detail_blocks)) + + +# ── Message 4: Links ───────────────────────────────────────────────── link_parts = [] if github_run_url: - link_parts.append(f"<{github_run_url}|GitHub Actions>") -if report_url: - link_parts.append(f"<{report_url}|Full Report>") -if link_parts: - 
blocks.append({"type": "divider"}) - blocks.append({ - "type": "context", - "elements": [{"type": "mrkdwn", "text": " ".join(link_parts)}], - }) + link_parts.append(f"<{github_run_url}|:github: GitHub Actions>") +link_parts.append("_Full report attached below_") -payload = { - "channel": "cuopt-regression-testing", - "username": "cuOpt Nightly Bot", - "icon_emoji": ":robot_face:", - "blocks": blocks, -} -print(json.dumps(payload)) +if link_parts: + print(make_payload([ + {"type": "divider"}, + {"type": "context", + "elements": [{"type": "mrkdwn", "text": " | ".join(link_parts)}]}, + ])) PYEOF ) echo "Sending consolidated Slack notification..." -curl -s -X POST \ - -H 'Content-type: application/json' \ - --data "${PAYLOAD}" \ - "${SLACK_WEBHOOK_URL}" - -echo "" +while IFS= read -r payload; do + response=$(curl -s -X POST \ + -H 'Content-type: application/json' \ + --data "${payload}" \ + "${SLACK_WEBHOOK_URL}") + if [ "${response}" != "ok" ]; then + echo "WARNING: Slack webhook returned: ${response}" >&2 + fi +done <<< "${PAYLOADS}" echo "Consolidated Slack notification sent." # Upload HTML report as a file to Slack (requires bot token) if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ] && [ -n "${CONSOLIDATED_HTML}" ] && [ -f "${CONSOLIDATED_HTML}" ]; then echo "Uploading HTML report to Slack..." 
- # Read date and branch from the summary for the filename REPORT_DATE=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('date','report'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "report") REPORT_BRANCH=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('branch','main'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "main") UPLOAD_FILENAME="cuopt-nightly-${REPORT_BRANCH}-${REPORT_DATE}.html" + FILE_SIZE=$(stat --format=%s "${CONSOLIDATED_HTML}") + UPLOAD_TITLE="cuOpt Nightly Report — ${REPORT_BRANCH} — ${REPORT_DATE}" - UPLOAD_RESPONSE=$(curl -s -X POST \ + # Step 1: Get an upload URL from Slack + URL_RESPONSE=$(curl -s -X POST \ -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ - -F "channels=${SLACK_CHANNEL_ID}" \ - -F "file=@${CONSOLIDATED_HTML}" \ - -F "filename=${UPLOAD_FILENAME}" \ - -F "title=cuOpt Nightly Report — ${REPORT_BRANCH} — ${REPORT_DATE}" \ - -F "initial_comment=Full nightly test report attached. Download and open in a browser for interactive details." \ - "https://slack.com/api/files.upload") - - if echo "${UPLOAD_RESPONSE}" | python3 -c "import json,sys; sys.exit(0 if json.load(sys.stdin).get('ok') else 1)" 2>/dev/null; then - echo "HTML report uploaded to Slack." + -H "Content-Type: application/x-www-form-urlencoded" \ + --data-urlencode "filename=${UPLOAD_FILENAME}" \ + --data-urlencode "length=${FILE_SIZE}" \ + "https://slack.com/api/files.getUploadURLExternal") + + UPLOAD_URL=$(echo "${URL_RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('upload_url',''))" 2>/dev/null) + FILE_ID=$(echo "${URL_RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('file_id',''))" 2>/dev/null) + + if [ -z "${UPLOAD_URL}" ] || [ -z "${FILE_ID}" ]; then + echo "WARNING: Slack file upload failed at getUploadURLExternal. Response: ${URL_RESPONSE}" >&2 else - echo "WARNING: Slack file upload failed. 
Response: ${UPLOAD_RESPONSE}" >&2 + # Step 2: Upload the file content to the presigned URL + curl -s -X POST \ + -F "file=@${CONSOLIDATED_HTML}" \ + "${UPLOAD_URL}" + + # Step 3: Complete the upload and share to channel + COMPLETE_PAYLOAD=$(python3 -c " +import json, sys +print(json.dumps({ + 'files': [{'id': sys.argv[1], 'title': sys.argv[2]}], + 'channel_id': sys.argv[3], + 'initial_comment': 'Full nightly test report \u2014 download and open in a browser for interactive details.' +})) +" "${FILE_ID}" "${UPLOAD_TITLE}" "${SLACK_CHANNEL_ID}") + + COMPLETE_RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${COMPLETE_PAYLOAD}" \ + "https://slack.com/api/files.completeUploadExternal") + + if echo "${COMPLETE_RESPONSE}" | python3 -c "import json,sys; sys.exit(0 if json.load(sys.stdin).get('ok') else 1)" 2>/dev/null; then + echo "HTML report uploaded to Slack." + else + echo "WARNING: Slack file upload failed at completeUploadExternal. Response: ${COMPLETE_RESPONSE}" >&2 + fi fi else if [ -n "${SLACK_BOT_TOKEN}" ] && [ -z "${SLACK_CHANNEL_ID}" ]; then From a85e8a0180784596b46dcffa79147dfa1b48f289 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 13:13:43 -0500 Subject: [PATCH 18/60] Add presigned URLs, workflow-level status, and show only failures in Slack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Generate presigned S3 URLs (7-day expiry) for consolidated HTML report and dashboard, linked in Slack messages - Query GitHub API for workflow job statuses to surface CI-level failures (notebooks, JuMP, etc.) 
that don't produce JUnit XML - Show only failed/flaky matrix entries in Slack instead of listing all passing ones — compact summary line for green runs - Pass GITHUB_TOKEN and GITHUB_RUN_ID to nightly-summary container - Remove temporary test workflow file --- .github/workflows/nightly-summary.yaml | 4 + ci/nightly_summary.sh | 26 ++++- ci/utils/aggregate_nightly.py | 54 ++++++++- ci/utils/send_consolidated_summary.sh | 150 +++++++++++++++++-------- 4 files changed, 185 insertions(+), 49 deletions(-) diff --git a/.github/workflows/nightly-summary.yaml b/.github/workflows/nightly-summary.yaml index 214fe84c97..c286466937 100644 --- a/.github/workflows/nightly-summary.yaml +++ b/.github/workflows/nightly-summary.yaml @@ -68,4 +68,8 @@ jobs: CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} RAPIDS_BRANCH: ${{ inputs.branch }} + GITHUB_TOKEN: ${{ github.token }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_SERVER_URL: ${{ github.server_url }} run: bash ci/nightly_summary.sh diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index c0aab7a52d..e23b11d1fc 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -13,6 +13,8 @@ # CUOPT_SLACK_WEBHOOK_URL - sends Slack if set # RAPIDS_BRANCH - branch name (default: main) # RAPIDS_BUILD_TYPE - build type (nightly, pull-request, etc.) +# GITHUB_TOKEN - for querying workflow job statuses +# GITHUB_RUN_ID - current workflow run ID set -euo pipefail @@ -40,6 +42,20 @@ S3_INDEX_URI="${S3_BASE}/index.json" S3_DASHBOARD_URI="${S3_BASE}/dashboard/index.html" DASHBOARD_DIR="${SCRIPT_DIR}/dashboard" +# --- Query GitHub API for workflow job statuses --- +WORKFLOW_JOBS_JSON="${OUTPUT_DIR}/workflow_jobs.json" +if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then + echo "Fetching workflow job statuses from GitHub API..." 
+ curl -s -L \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \ + > "${WORKFLOW_JOBS_JSON}" || echo "{}" > "${WORKFLOW_JOBS_JSON}" +else + echo "WARNING: GITHUB_TOKEN or GITHUB_RUN_ID not set, skipping workflow job status." >&2 + echo "{}" > "${WORKFLOW_JOBS_JSON}" +fi + echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}" python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ @@ -53,7 +69,13 @@ python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ --output-dir "${OUTPUT_DIR}" \ --date "${RUN_DATE}" \ --branch "${BRANCH}" \ - --github-run-url "${GITHUB_RUN_URL}" + --github-run-url "${GITHUB_RUN_URL}" \ + --workflow-jobs "${WORKFLOW_JOBS_JSON}" + +# --- Generate presigned URLs for reports (7-day expiry) --- +PRESIGN_EXPIRY=604800 +PRESIGNED_HTML=$(aws s3 presign "${S3_CONSOLIDATED_HTML}" --expires-in "${PRESIGN_EXPIRY}" 2>/dev/null || echo "") +PRESIGNED_DASHBOARD=$(aws s3 presign "${S3_DASHBOARD_URI}" --expires-in "${PRESIGN_EXPIRY}" 2>/dev/null || echo "") # Send consolidated Slack notification if webhook is available and this is a nightly build if [ -n "${CUOPT_SLACK_WEBHOOK_URL:-}" ] && [ "${RAPIDS_BUILD_TYPE:-}" = "nightly" ]; then @@ -63,6 +85,8 @@ if [ -n "${CUOPT_SLACK_WEBHOOK_URL:-}" ] && [ "${RAPIDS_BUILD_TYPE:-}" = "nightl SLACK_WEBHOOK_URL="${CUOPT_SLACK_WEBHOOK_URL}" \ SLACK_BOT_TOKEN="${CUOPT_SLACK_BOT_TOKEN:-}" \ SLACK_CHANNEL_ID="${CUOPT_SLACK_CHANNEL_ID:-}" \ + PRESIGNED_REPORT_URL="${PRESIGNED_HTML}" \ + PRESIGNED_DASHBOARD_URL="${PRESIGNED_DASHBOARD}" \ bash "${SCRIPT_DIR}/utils/send_consolidated_summary.sh" fi diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 31e567f487..78172ddbe5 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -172,7 +172,38 @@ def aggregate_summaries(summaries): # 
--------------------------------------------------------------------------- -def generate_consolidated_json(agg, date_str, branch, github_run_url=""): +def parse_workflow_jobs(workflow_jobs_path): + """Parse GitHub Actions workflow job statuses from JSON file. + Returns a list of dicts with job name, conclusion, and URL.""" + if not workflow_jobs_path or not Path(workflow_jobs_path).exists(): + return [] + try: + with open(workflow_jobs_path) as f: + data = json.load(f) + jobs_list = data.get("jobs", []) + result = [] + for job in jobs_list: + name = job.get("name", "") + # Skip the nightly-summary job itself + if "nightly-summary" in name.lower(): + continue + result.append({ + "name": name, + "conclusion": job.get("conclusion", "unknown"), + "status": job.get("status", "unknown"), + "url": job.get("html_url", ""), + }) + return result + except (json.JSONDecodeError, OSError) as exc: + print( + f"WARNING: Failed to parse workflow jobs: {exc}", + file=sys.stderr, + ) + return [] + + +def generate_consolidated_json(agg, date_str, branch, github_run_url="", + workflow_jobs=None): """Generate the consolidated JSON for Slack and dashboard.""" total_jobs = len(agg["matrix_grid"]) failed_jobs = sum( @@ -181,6 +212,10 @@ def generate_consolidated_json(agg, date_str, branch, github_run_url=""): flaky_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "flaky") passed_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "passed") + # Workflow-level CI job statuses (notebooks, JuMP, etc.) 
+ wf_jobs = workflow_jobs or [] + failed_ci_jobs = [j for j in wf_jobs if j["conclusion"] == "failure"] + return { "timestamp": datetime.now(timezone.utc).isoformat(), "date": date_str, @@ -199,6 +234,8 @@ def generate_consolidated_json(agg, date_str, branch, github_run_url=""): "recurring_failures": agg["all_recurring_failures"], "flaky_tests": agg["all_flaky_tests"], "resolved_tests": agg["all_resolved_tests"], + "workflow_jobs": wf_jobs, + "failed_ci_jobs": failed_ci_jobs, } @@ -556,6 +593,11 @@ def main(): default="", help="URL to the GitHub Actions run", ) + parser.add_argument( + "--workflow-jobs", + default="", + help="Path to JSON file with GitHub Actions workflow job statuses", + ) args = parser.parse_args() output_dir = Path(args.output_dir) @@ -591,12 +633,22 @@ def main(): f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'flaky')} flaky" ) + # ---- Step 2b: Parse workflow job statuses ---- + workflow_jobs = parse_workflow_jobs(args.workflow_jobs) + if workflow_jobs: + failed_wf = [j for j in workflow_jobs if j["conclusion"] == "failure"] + print( + f"Workflow jobs: {len(workflow_jobs)} total, " + f"{len(failed_wf)} failed" + ) + # ---- Step 3: Generate outputs ---- consolidated = generate_consolidated_json( agg, args.date, args.branch, args.github_run_url, + workflow_jobs, ) json_path = output_dir / "consolidated_summary.json" diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 32f9f8005b..d302185f26 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -5,10 +5,10 @@ # Send a consolidated Slack notification for the entire nightly run. # Reads the aggregated JSON produced by aggregate_nightly.py and sends # chunked Slack messages: -# 1. Header + status summary + test totals -# 2. Matrix grid (passed / failed / flaky, chunked by test type) +# 1. Header + status summary + test totals + failed CI jobs +# 2. Failed/flaky matrix entries only (not passing ones) # 3. 
Failure details (new, recurring, stabilized, flaky) -# 4. Links +# 4. Links (presigned URLs + GitHub Actions) # Then uploads the HTML report as a Slack file. # # Required environment variables: @@ -16,9 +16,11 @@ # CONSOLIDATED_SUMMARY - Path to consolidated_summary.json # # Optional environment variables: -# CONSOLIDATED_HTML - Path to consolidated HTML file to upload to Slack -# SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) for file uploads -# SLACK_CHANNEL_ID - Slack channel ID for file uploads (required with bot token) +# CONSOLIDATED_HTML - Path to consolidated HTML file to upload +# SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) for file uploads +# SLACK_CHANNEL_ID - Slack channel ID for file uploads +# PRESIGNED_REPORT_URL - Presigned URL for consolidated HTML report +# PRESIGNED_DASHBOARD_URL - Presigned URL for dashboard set -euo pipefail @@ -27,6 +29,8 @@ SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:?SLACK_WEBHOOK_URL is required}" CONSOLIDATED_HTML="${CONSOLIDATED_HTML:-}" SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}" SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}" +PRESIGNED_REPORT_URL="${PRESIGNED_REPORT_URL:-}" +PRESIGNED_DASHBOARD_URL="${PRESIGNED_DASHBOARD_URL:-}" if [ ! -f "${CONSOLIDATED_SUMMARY}" ]; then echo "ERROR: Summary file not found: ${CONSOLIDATED_SUMMARY}" >&2 @@ -34,10 +38,12 @@ if [ ! 
-f "${CONSOLIDATED_SUMMARY}" ]; then fi # Generate chunked Slack payloads — one JSON object per line -PAYLOADS=$(python3 - "${CONSOLIDATED_SUMMARY}" <<'PYEOF' +PAYLOADS=$(python3 - "${CONSOLIDATED_SUMMARY}" "${PRESIGNED_REPORT_URL}" "${PRESIGNED_DASHBOARD_URL}" <<'PYEOF' import json, sys summary_path = sys.argv[1] +presigned_report_url = sys.argv[2] if len(sys.argv) > 2 else "" +presigned_dashboard_url = sys.argv[3] if len(sys.argv) > 3 else "" with open(summary_path) as f: d = json.load(f) @@ -49,12 +55,19 @@ jobs = d.get("job_summary", {}) totals = d.get("test_totals", {}) grid = d.get("matrix_grid", []) has_new = d.get("has_new_failures", False) +failed_ci_jobs = d.get("failed_ci_jobs", []) +workflow_jobs = d.get("workflow_jobs", []) total_jobs = jobs.get("total", 0) failed_jobs = jobs.get("failed", 0) flaky_jobs = jobs.get("flaky", 0) passed_jobs = jobs.get("passed", 0) +# Count CI-level failures (jobs that failed at workflow level) +total_ci_jobs = len(workflow_jobs) +failed_ci_count = len(failed_ci_jobs) +passed_ci_count = sum(1 for j in workflow_jobs if j["conclusion"] == "success") + status_icons = { "passed": ":white_check_mark:", "failed-new": ":rotating_light:", @@ -71,12 +84,22 @@ def make_payload(blocks): }) -# ── Message 1: Header + status + totals ────────────────────────────── +# ── Message 1: Header + status + totals + CI job failures ──────────── blocks = [] -if failed_jobs > 0 and has_new: +# Determine overall status considering both test results and CI jobs +all_green = failed_jobs == 0 and failed_ci_count == 0 + +if failed_ci_count > 0 or (failed_jobs > 0 and has_new): emoji = ":rotating_light:" - text = f"NEW test failures in {failed_jobs} matrix job(s)" + parts = [] + if failed_ci_count > 0: + parts.append(f"{failed_ci_count} CI job(s) failed") + if failed_jobs > 0 and has_new: + parts.append(f"NEW test failures in {failed_jobs} matrix job(s)") + elif failed_jobs > 0: + parts.append(f"recurring failures in {failed_jobs} matrix job(s)") + text 
= " + ".join(parts) mention = " " elif failed_jobs > 0: emoji = ":x:" @@ -89,6 +112,8 @@ elif flaky_jobs > 0: else: emoji = ":white_check_mark:" text = f"All {total_jobs} matrix jobs passed" + if total_ci_jobs > 0: + text += f", all {passed_ci_count} CI jobs succeeded" mention = "" stats = ( @@ -114,49 +139,76 @@ blocks.append({ "text": f"{mention}{emoji} *{text}*\n\n{stats}", }, }) -print(make_payload(blocks)) - -# ── Message 2: Matrix grid (chunked by test type) ──────────────────── -test_types = {} -for g in grid: - tt = g["test_type"] - test_types.setdefault(tt, []).append(g) - -# Split into sections that fit within Slack's 3000 char limit per block -grid_blocks = [] -current_text = "" -for tt, entries in sorted(test_types.items()): - cells = [] - for g in entries: - icon = status_icons.get(g["status"], ":grey_question:") - label = g["matrix_label"] - failed_count = g["counts"].get("failed", 0) - if failed_count > 0: - cells.append(f"{icon} `{label}` ({failed_count} failures)") +# Show failed CI jobs (notebooks, JuMP, etc.) 
+if failed_ci_jobs: + lines = [] + for j in failed_ci_jobs: + url = j.get("url", "") + name = j["name"] + if url: + lines.append(f":x: <{url}|{name}>") else: - cells.append(f"{icon} `{label}`") - section = f"*{tt}*\n" + "\n".join(f" {c}" for c in cells) + "\n" + lines.append(f":x: {name}") + blocks.append({"type": "divider"}) + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": "*Failed CI Jobs:*\n" + "\n".join(lines)}, + }) - # If adding this section would exceed limit, flush current block - if current_text and len(current_text) + len(section) > 2800: +print(make_payload(blocks)) + + +# ── Message 2: Failed/flaky matrix entries only ────────────────────── +# Only show entries that are NOT passed +failed_grid = [g for g in grid if g["status"] != "passed"] + +if failed_grid: + test_types = {} + for g in failed_grid: + tt = g["test_type"] + test_types.setdefault(tt, []).append(g) + + grid_blocks = [] + current_text = "" + for tt, entries in sorted(test_types.items()): + cells = [] + for g in entries: + icon = status_icons.get(g["status"], ":grey_question:") + label = g["matrix_label"] + failed_count = g["counts"].get("failed", 0) + if failed_count > 0: + cells.append(f"{icon} `{label}` ({failed_count} failures)") + else: + cells.append(f"{icon} `{label}`") + section = f"*{tt}*\n" + "\n".join(f" {c}" for c in cells) + "\n" + + if current_text and len(current_text) + len(section) > 2800: + grid_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current_text.rstrip()}, + }) + current_text = "" + current_text += section + + if current_text: grid_blocks.append({ "type": "section", "text": {"type": "mrkdwn", "text": current_text.rstrip()}, }) - current_text = "" - current_text += section -if current_text: - grid_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": current_text.rstrip()}, - }) - -# Chunk grid blocks into messages of at most 48 blocks (leave room for divider) -for i in range(0, 
len(grid_blocks), 48): - chunk = grid_blocks[i:i+48] - print(make_payload([{"type": "divider"}] + chunk)) + for i in range(0, len(grid_blocks), 48): + chunk = grid_blocks[i:i+48] + print(make_payload([{"type": "divider"}] + chunk)) +else: + # All passed — just a compact summary + if total_jobs > 0: + print(make_payload([ + {"type": "divider"}, + {"type": "section", + "text": {"type": "mrkdwn", + "text": f":white_check_mark: All {total_jobs} test matrix jobs passed"}}, + ])) # ── Message 3: Failure details ──────────────────────────────────────── @@ -175,7 +227,6 @@ if new_failures: if len(new_failures) > 15: lines.append(f"_...and {len(new_failures) - 15} more_") text = "*:rotating_light: New Failures:*\n" + "\n".join(lines) - # Split into 3000-char chunks if needed while text: detail_blocks.append({ "type": "section", @@ -251,7 +302,12 @@ if detail_blocks: link_parts = [] if github_run_url: link_parts.append(f"<{github_run_url}|:github: GitHub Actions>") -link_parts.append("_Full report attached below_") +if presigned_report_url: + link_parts.append(f"<{presigned_report_url}|:bar_chart: Full Report>") +if presigned_dashboard_url: + link_parts.append(f"<{presigned_dashboard_url}|:chart_with_upwards_trend: Dashboard>") +if not presigned_report_url: + link_parts.append("_Full report attached below_") if link_parts: print(make_payload([ From 3ee8c1fd1dd3f7d835e33f3bf038b91a2b2b24dc Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 13:22:38 -0500 Subject: [PATCH 19/60] Use Slack threading for nightly summary details Post the main summary (status + links) as a top-level message via chat.postMessage, then post matrix details, failure breakdowns, and the HTML report as thread replies. Keeps the channel clean while preserving full detail in the thread. Falls back to webhook (no threading) if bot token is not available. 
--- ci/utils/send_consolidated_summary.sh | 175 ++++++++++++++++---------- 1 file changed, 109 insertions(+), 66 deletions(-) diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index d302185f26..c6fced3676 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -3,22 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 # Send a consolidated Slack notification for the entire nightly run. -# Reads the aggregated JSON produced by aggregate_nightly.py and sends -# chunked Slack messages: -# 1. Header + status summary + test totals + failed CI jobs -# 2. Failed/flaky matrix entries only (not passing ones) -# 3. Failure details (new, recurring, stabilized, flaky) -# 4. Links (presigned URLs + GitHub Actions) -# Then uploads the HTML report as a Slack file. +# Reads the aggregated JSON produced by aggregate_nightly.py and sends: +# - Main message: Header + status summary + test totals + failed CI jobs +# - Thread replies: matrix details, failure details, links, HTML report +# +# If SLACK_BOT_TOKEN is available, posts via chat.postMessage (enables +# threading). Falls back to webhook (no threading) otherwise. # # Required environment variables: -# SLACK_WEBHOOK_URL - Slack incoming webhook URL +# SLACK_WEBHOOK_URL - Slack incoming webhook URL (fallback) # CONSOLIDATED_SUMMARY - Path to consolidated_summary.json # # Optional environment variables: # CONSOLIDATED_HTML - Path to consolidated HTML file to upload -# SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) for file uploads -# SLACK_CHANNEL_ID - Slack channel ID for file uploads +# SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) for threading + file uploads +# SLACK_CHANNEL_ID - Slack channel ID (required with bot token) # PRESIGNED_REPORT_URL - Presigned URL for consolidated HTML report # PRESIGNED_DASHBOARD_URL - Presigned URL for dashboard @@ -37,7 +36,8 @@ if [ ! 
-f "${CONSOLIDATED_SUMMARY}" ]; then exit 1 fi -# Generate chunked Slack payloads — one JSON object per line +# Generate Slack payloads — one JSON object per line. +# Line 1 = main message, lines 2+ = thread replies. PAYLOADS=$(python3 - "${CONSOLIDATED_SUMMARY}" "${PRESIGNED_REPORT_URL}" "${PRESIGNED_DASHBOARD_URL}" <<'PYEOF' import json, sys @@ -63,7 +63,6 @@ failed_jobs = jobs.get("failed", 0) flaky_jobs = jobs.get("flaky", 0) passed_jobs = jobs.get("passed", 0) -# Count CI-level failures (jobs that failed at workflow level) total_ci_jobs = len(workflow_jobs) failed_ci_count = len(failed_ci_jobs) passed_ci_count = sum(1 for j in workflow_jobs if j["conclusion"] == "success") @@ -84,12 +83,11 @@ def make_payload(blocks): }) -# ── Message 1: Header + status + totals + CI job failures ──────────── +# ══════════════════════════════════════════════════════════════════════ +# MAIN MESSAGE (line 1) — posted to channel, becomes thread parent +# ══════════════════════════════════════════════════════════════════════ blocks = [] -# Determine overall status considering both test results and CI jobs -all_green = failed_jobs == 0 and failed_ci_count == 0 - if failed_ci_count > 0 or (failed_jobs > 0 and has_new): emoji = ":rotating_light:" parts = [] @@ -140,7 +138,7 @@ blocks.append({ }, }) -# Show failed CI jobs (notebooks, JuMP, etc.) 
+# Failed CI jobs in main message if failed_ci_jobs: lines = [] for j in failed_ci_jobs: @@ -156,11 +154,29 @@ if failed_ci_jobs: "text": {"type": "mrkdwn", "text": "*Failed CI Jobs:*\n" + "\n".join(lines)}, }) +# Links in main message +link_parts = [] +if github_run_url: + link_parts.append(f"<{github_run_url}|:github: GitHub Actions>") +if presigned_report_url: + link_parts.append(f"<{presigned_report_url}|:bar_chart: Full Report>") +if presigned_dashboard_url: + link_parts.append(f"<{presigned_dashboard_url}|:chart_with_upwards_trend: Dashboard>") +if link_parts: + blocks.append({"type": "divider"}) + blocks.append({ + "type": "context", + "elements": [{"type": "mrkdwn", "text": " | ".join(link_parts)}], + }) + print(make_payload(blocks)) -# ── Message 2: Failed/flaky matrix entries only ────────────────────── -# Only show entries that are NOT passed +# ══════════════════════════════════════════════════════════════════════ +# THREAD REPLIES (lines 2+) — posted as replies to main message +# ══════════════════════════════════════════════════════════════════════ + +# ── Thread 1: Failed/flaky matrix entries ───────────────────────────── failed_grid = [g for g in grid if g["status"] != "passed"] if failed_grid: @@ -199,22 +215,11 @@ if failed_grid: for i in range(0, len(grid_blocks), 48): chunk = grid_blocks[i:i+48] - print(make_payload([{"type": "divider"}] + chunk)) -else: - # All passed — just a compact summary - if total_jobs > 0: - print(make_payload([ - {"type": "divider"}, - {"type": "section", - "text": {"type": "mrkdwn", - "text": f":white_check_mark: All {total_jobs} test matrix jobs passed"}}, - ])) - + print(make_payload(chunk)) -# ── Message 3: Failure details ──────────────────────────────────────── +# ── Thread 2: Failure details ───────────────────────────────────────── detail_blocks = [] -# New failures new_failures = d.get("new_failures", []) if new_failures: lines = [] @@ -234,7 +239,6 @@ if new_failures: }) text = text[2900:] -# Recurring 
failures recurring = d.get("recurring_failures", []) if recurring: lines = [] @@ -252,7 +256,6 @@ if recurring: "text": {"type": "mrkdwn", "text": "*:x: Recurring Failures:*\n" + "\n".join(lines)}, }) -# Stabilized resolved = d.get("resolved_tests", []) if resolved: lines = [] @@ -273,7 +276,6 @@ if resolved: }, }) -# Flaky summary flaky = d.get("flaky_tests", []) if flaky: unique_flaky = {} @@ -296,41 +298,78 @@ if flaky: if detail_blocks: print(make_payload(detail_blocks)) - - -# ── Message 4: Links ───────────────────────────────────────────────── -link_parts = [] -if github_run_url: - link_parts.append(f"<{github_run_url}|:github: GitHub Actions>") -if presigned_report_url: - link_parts.append(f"<{presigned_report_url}|:bar_chart: Full Report>") -if presigned_dashboard_url: - link_parts.append(f"<{presigned_dashboard_url}|:chart_with_upwards_trend: Dashboard>") -if not presigned_report_url: - link_parts.append("_Full report attached below_") - -if link_parts: - print(make_payload([ - {"type": "divider"}, - {"type": "context", - "elements": [{"type": "mrkdwn", "text": " | ".join(link_parts)}]}, - ])) PYEOF ) +# ── Send messages ───────────────────────────────────────────────────── echo "Sending consolidated Slack notification..." 
+ +THREAD_TS="" +FIRST=true + while IFS= read -r payload; do - response=$(curl -s -X POST \ - -H 'Content-type: application/json' \ - --data "${payload}" \ - "${SLACK_WEBHOOK_URL}") - if [ "${response}" != "ok" ]; then - echo "WARNING: Slack webhook returned: ${response}" >&2 + if [ "${FIRST}" = true ] && [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then + # Post main message via chat.postMessage to get thread_ts + BOT_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +print(json.dumps(p)) +" "${payload}" "${SLACK_CHANNEL_ID}") + + RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${BOT_PAYLOAD}" \ + "https://slack.com/api/chat.postMessage") + + THREAD_TS=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ts',''))" 2>/dev/null || echo "") + OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "") + + if [ "${OK}" != "True" ]; then + echo "WARNING: chat.postMessage failed: ${RESPONSE}" >&2 + # Fall back to webhook for this and remaining messages + THREAD_TS="" + curl -s -X POST -H 'Content-type: application/json' --data "${payload}" "${SLACK_WEBHOOK_URL}" || true + else + echo "Main message posted (ts=${THREAD_TS})" + fi + FIRST=false + elif [ -n "${THREAD_TS}" ] && [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then + # Post thread reply via chat.postMessage + THREAD_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +p['thread_ts'] = sys.argv[3] +print(json.dumps(p)) +" "${payload}" "${SLACK_CHANNEL_ID}" "${THREAD_TS}") + + RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${THREAD_PAYLOAD}" \ + "https://slack.com/api/chat.postMessage") + + OK=$(echo "${RESPONSE}" | python3 -c "import 
json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "") + if [ "${OK}" != "True" ]; then + echo "WARNING: Thread reply failed: ${RESPONSE}" >&2 + fi + else + # Fallback: webhook (no threading) + response=$(curl -s -X POST \ + -H 'Content-type: application/json' \ + --data "${payload}" \ + "${SLACK_WEBHOOK_URL}") + if [ "${response}" != "ok" ]; then + echo "WARNING: Slack webhook returned: ${response}" >&2 + fi + FIRST=false fi done <<< "${PAYLOADS}" echo "Consolidated Slack notification sent." -# Upload HTML report as a file to Slack (requires bot token) +# ── Upload HTML report as file in thread ────────────────────────────── if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ] && [ -n "${CONSOLIDATED_HTML}" ] && [ -f "${CONSOLIDATED_HTML}" ]; then echo "Uploading HTML report to Slack..." @@ -359,15 +398,19 @@ if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ] && [ -n "${CONSOL -F "file=@${CONSOLIDATED_HTML}" \ "${UPLOAD_URL}" - # Step 3: Complete the upload and share to channel + # Step 3: Complete the upload and share to channel (in thread if available) COMPLETE_PAYLOAD=$(python3 -c " import json, sys -print(json.dumps({ +payload = { 'files': [{'id': sys.argv[1], 'title': sys.argv[2]}], 'channel_id': sys.argv[3], - 'initial_comment': 'Full nightly test report \u2014 download and open in a browser for interactive details.' 
-})) -" "${FILE_ID}" "${UPLOAD_TITLE}" "${SLACK_CHANNEL_ID}") + 'initial_comment': 'Full nightly test report \u2014 download and open in a browser for interactive details.', +} +thread_ts = sys.argv[4] if len(sys.argv) > 4 and sys.argv[4] else '' +if thread_ts: + payload['thread_ts'] = thread_ts +print(json.dumps(payload)) +" "${FILE_ID}" "${UPLOAD_TITLE}" "${SLACK_CHANNEL_ID}" "${THREAD_TS}") COMPLETE_RESPONSE=$(curl -s -X POST \ -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ From 4e8f5610193c8d2aad820a4d6db1e94747d61682 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 13:40:11 -0500 Subject: [PATCH 20/60] Fix CI job filtering and Slack block size limits - Filter out per-matrix test jobs (conda-cpp-tests, conda-python-tests, wheel-tests-*) from workflow job status since they are already tracked by S3 summaries. Only surface untracked jobs like notebooks and JuMP. - Move full CI job failure list to thread reply to avoid exceeding Slack's 3000-char block limit. Main message shows compact summary. - Chunk CI job details into multiple blocks if needed. --- ci/utils/aggregate_nightly.py | 18 +++++++++++- ci/utils/send_consolidated_summary.sh | 40 +++++++++++++++++++-------- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 78172ddbe5..7df91923d7 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -174,9 +174,22 @@ def aggregate_summaries(summaries): def parse_workflow_jobs(workflow_jobs_path): """Parse GitHub Actions workflow job statuses from JSON file. - Returns a list of dicts with job name, conclusion, and URL.""" + Returns a list of dicts with job name, conclusion, and URL. 
+ Only includes jobs NOT already tracked by per-matrix S3 summaries + (i.e., excludes conda-cpp-tests, conda-python-tests, + wheel-tests-cuopt, wheel-tests-cuopt-server matrix jobs and + their compute-matrix helpers).""" if not workflow_jobs_path or not Path(workflow_jobs_path).exists(): return [] + + # Job name prefixes that are already covered by per-matrix S3 reports + TRACKED_PREFIXES = ( + "conda-cpp-tests", + "conda-python-tests", + "wheel-tests-cuopt-server", + "wheel-tests-cuopt", + ) + try: with open(workflow_jobs_path) as f: data = json.load(f) @@ -187,6 +200,9 @@ def parse_workflow_jobs(workflow_jobs_path): # Skip the nightly-summary job itself if "nightly-summary" in name.lower(): continue + # Skip jobs already tracked by per-matrix S3 summaries + if any(name.startswith(prefix) for prefix in TRACKED_PREFIXES): + continue result.append({ "name": name, "conclusion": job.get("conclusion", "unknown"), diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index c6fced3676..609a0dcbc5 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -138,20 +138,16 @@ blocks.append({ }, }) -# Failed CI jobs in main message +# Failed CI jobs summary in main message (details in thread) if failed_ci_jobs: - lines = [] - for j in failed_ci_jobs: - url = j.get("url", "") - name = j["name"] - if url: - lines.append(f":x: <{url}|{name}>") - else: - lines.append(f":x: {name}") + names = [j["name"] for j in failed_ci_jobs] + summary = f":x: *{len(failed_ci_jobs)} CI job(s) failed:* " + ", ".join(f"`{n}`" for n in names[:5]) + if len(names) > 5: + summary += f" _+{len(names) - 5} more_" blocks.append({"type": "divider"}) blocks.append({ "type": "section", - "text": {"type": "mrkdwn", "text": "*Failed CI Jobs:*\n" + "\n".join(lines)}, + "text": {"type": "mrkdwn", "text": summary}, }) # Links in main message @@ -176,7 +172,29 @@ print(make_payload(blocks)) # THREAD REPLIES (lines 2+) — posted as replies 
to main message # ══════════════════════════════════════════════════════════════════════ -# ── Thread 1: Failed/flaky matrix entries ───────────────────────────── +# ── Thread 1: Failed CI job details ─────────────────────────────────── +if failed_ci_jobs: + ci_blocks = [] + current = "*Failed CI Jobs:*\n" + for j in failed_ci_jobs: + url = j.get("url", "") + name = j["name"] + line = f":x: <{url}|{name}>\n" if url else f":x: {name}\n" + if len(current) + len(line) > 2900: + ci_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current.rstrip()}, + }) + current = "" + current += line + if current.strip(): + ci_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current.rstrip()}, + }) + print(make_payload(ci_blocks)) + +# ── Thread 2: Failed/flaky matrix entries ───────────────────────────── failed_grid = [g for g in grid if g["status"] != "passed"] if failed_grid: From ac91a7452e6d18f869ab0d5ed43e31defc4abb7c Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 13:45:51 -0500 Subject: [PATCH 21/60] Show all CI workflow statuses and group test issues by workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Include ALL workflow jobs in consolidated JSON (not just untracked) with has_test_details flag to distinguish tracked vs untracked - Thread reply 1: CI Workflow Status showing every workflow group with pass/fail counts — new workflows automatically visible - Thread reply 2: Failing and flaky tests grouped by workflow so users see which workflow has which test issues - Main message alerts only on untracked CI failures (notebooks, JuMP) since tracked failures already appear in the matrix test grid --- ci/utils/aggregate_nightly.py | 25 ++- ci/utils/send_consolidated_summary.sh | 237 ++++++++++++-------------- 2 files changed, 122 insertions(+), 140 deletions(-) diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 
7df91923d7..a0d6db4c90 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -174,15 +174,15 @@ def aggregate_summaries(summaries): def parse_workflow_jobs(workflow_jobs_path): """Parse GitHub Actions workflow job statuses from JSON file. - Returns a list of dicts with job name, conclusion, and URL. - Only includes jobs NOT already tracked by per-matrix S3 summaries - (i.e., excludes conda-cpp-tests, conda-python-tests, - wheel-tests-cuopt, wheel-tests-cuopt-server matrix jobs and - their compute-matrix helpers).""" + Returns all jobs (except nightly-summary itself) with name, + conclusion, URL, and whether they are tracked by per-matrix + S3 summaries.""" if not workflow_jobs_path or not Path(workflow_jobs_path).exists(): return [] - # Job name prefixes that are already covered by per-matrix S3 reports + # Job name prefixes that are covered by per-matrix S3 reports. + # These jobs also have detailed test results; other jobs only have + # a pass/fail status at the workflow level. TRACKED_PREFIXES = ( "conda-cpp-tests", "conda-python-tests", @@ -200,14 +200,16 @@ def parse_workflow_jobs(workflow_jobs_path): # Skip the nightly-summary job itself if "nightly-summary" in name.lower(): continue - # Skip jobs already tracked by per-matrix S3 summaries - if any(name.startswith(prefix) for prefix in TRACKED_PREFIXES): + # Skip helper jobs (compute-matrix, etc.) 
+ if "compute-matrix" in name.lower(): continue + tracked = any(name.startswith(p) for p in TRACKED_PREFIXES) result.append({ "name": name, "conclusion": job.get("conclusion", "unknown"), "status": job.get("status", "unknown"), "url": job.get("html_url", ""), + "has_test_details": tracked, }) return result except (json.JSONDecodeError, OSError) as exc: @@ -228,9 +230,13 @@ def generate_consolidated_json(agg, date_str, branch, github_run_url="", flaky_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "flaky") passed_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "passed") - # Workflow-level CI job statuses (notebooks, JuMP, etc.) + # Workflow-level CI job statuses wf_jobs = workflow_jobs or [] failed_ci_jobs = [j for j in wf_jobs if j["conclusion"] == "failure"] + # Jobs without per-matrix S3 tracking (notebooks, JuMP, etc.) + untracked_failed = [ + j for j in failed_ci_jobs if not j.get("has_test_details", False) + ] return { "timestamp": datetime.now(timezone.utc).isoformat(), @@ -252,6 +258,7 @@ def generate_consolidated_json(agg, date_str, branch, github_run_url="", "resolved_tests": agg["all_resolved_tests"], "workflow_jobs": wf_jobs, "failed_ci_jobs": failed_ci_jobs, + "untracked_failed_ci_jobs": untracked_failed, } diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 609a0dcbc5..d6466d1bd0 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -56,6 +56,7 @@ totals = d.get("test_totals", {}) grid = d.get("matrix_grid", []) has_new = d.get("has_new_failures", False) failed_ci_jobs = d.get("failed_ci_jobs", []) +untracked_failed = d.get("untracked_failed_ci_jobs", []) workflow_jobs = d.get("workflow_jobs", []) total_jobs = jobs.get("total", 0) @@ -88,11 +89,13 @@ def make_payload(blocks): # ══════════════════════════════════════════════════════════════════════ blocks = [] -if failed_ci_count > 0 or (failed_jobs > 0 and has_new): +untracked_count = 
len(untracked_failed) +if untracked_count > 0 or (failed_jobs > 0 and has_new): emoji = ":rotating_light:" parts = [] - if failed_ci_count > 0: - parts.append(f"{failed_ci_count} CI job(s) failed") + if untracked_count > 0: + names = [j["name"] for j in untracked_failed] + parts.append(f"{untracked_count} CI job(s) failed ({', '.join(names[:3])})") if failed_jobs > 0 and has_new: parts.append(f"NEW test failures in {failed_jobs} matrix job(s)") elif failed_jobs > 0: @@ -138,10 +141,10 @@ blocks.append({ }, }) -# Failed CI jobs summary in main message (details in thread) -if failed_ci_jobs: - names = [j["name"] for j in failed_ci_jobs] - summary = f":x: *{len(failed_ci_jobs)} CI job(s) failed:* " + ", ".join(f"`{n}`" for n in names[:5]) +# Failed untracked CI jobs in main message (details in thread) +if untracked_failed: + names = [j["name"] for j in untracked_failed] + summary = f":x: *{len(untracked_failed)} CI job(s) failed:* " + ", ".join(f"`{n}`" for n in names[:5]) if len(names) > 5: summary += f" _+{len(names) - 5} more_" blocks.append({"type": "divider"}) @@ -172,14 +175,37 @@ print(make_payload(blocks)) # THREAD REPLIES (lines 2+) — posted as replies to main message # ══════════════════════════════════════════════════════════════════════ -# ── Thread 1: Failed CI job details ─────────────────────────────────── -if failed_ci_jobs: +# ── Thread 1: CI Workflow Status (all jobs) ─────────────────────────── +# Shows every workflow job so new workflows are automatically visible. 
+if workflow_jobs: + ci_icons = {"success": ":white_check_mark:", "failure": ":x:", + "cancelled": ":no_entry_sign:", "skipped": ":fast_forward:"} + + # Group by workflow prefix (e.g., "conda-cpp-tests", "conda-notebook-tests") + wf_groups = {} + for j in workflow_jobs: + # Use the part before " / " as group name, or full name + prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] + wf_groups.setdefault(prefix, []).append(j) + ci_blocks = [] - current = "*Failed CI Jobs:*\n" - for j in failed_ci_jobs: - url = j.get("url", "") - name = j["name"] - line = f":x: <{url}|{name}>\n" if url else f":x: {name}\n" + current = "*CI Workflow Status:*\n" + for group_name, group_jobs in sorted(wf_groups.items()): + passed = sum(1 for j in group_jobs if j["conclusion"] == "success") + failed = sum(1 for j in group_jobs if j["conclusion"] == "failure") + total = len(group_jobs) + + if failed > 0: + icon = ":x:" + detail = f"{failed}/{total} failed" + elif passed == total: + icon = ":white_check_mark:" + detail = f"{total} passed" + else: + icon = ":grey_question:" + detail = f"{passed}/{total} passed" + + line = f"{icon} *{group_name}* — {detail}\n" if len(current) + len(line) > 2900: ci_blocks.append({ "type": "section", @@ -187,6 +213,7 @@ if failed_ci_jobs: }) current = "" current += line + if current.strip(): ci_blocks.append({ "type": "section", @@ -194,128 +221,76 @@ if failed_ci_jobs: }) print(make_payload(ci_blocks)) -# ── Thread 2: Failed/flaky matrix entries ───────────────────────────── -failed_grid = [g for g in grid if g["status"] != "passed"] - -if failed_grid: - test_types = {} - for g in failed_grid: - tt = g["test_type"] - test_types.setdefault(tt, []).append(g) - - grid_blocks = [] - current_text = "" - for tt, entries in sorted(test_types.items()): - cells = [] - for g in entries: - icon = status_icons.get(g["status"], ":grey_question:") - label = g["matrix_label"] - failed_count = g["counts"].get("failed", 0) - if failed_count > 0: - 
cells.append(f"{icon} `{label}` ({failed_count} failures)") - else: - cells.append(f"{icon} `{label}`") - section = f"*{tt}*\n" + "\n".join(f" {c}" for c in cells) + "\n" - - if current_text and len(current_text) + len(section) > 2800: - grid_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": current_text.rstrip()}, - }) - current_text = "" - current_text += section - - if current_text: - grid_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": current_text.rstrip()}, - }) - - for i in range(0, len(grid_blocks), 48): - chunk = grid_blocks[i:i+48] - print(make_payload(chunk)) - -# ── Thread 2: Failure details ───────────────────────────────────────── -detail_blocks = [] - +# ── Thread 2: Failing and flaky tests (grouped by workflow) ─────────── +# Build per-workflow test issue lists new_failures = d.get("new_failures", []) -if new_failures: - lines = [] - for f_entry in new_failures[:15]: - msg = f_entry.get("message", "")[:80].replace("\n", " ") - matrix = f_entry.get("matrix_label", "") - lines.append( - f":new: `{f_entry['name']}` ({f_entry['test_type']} / {matrix})\n {msg}" - ) - if len(new_failures) > 15: - lines.append(f"_...and {len(new_failures) - 15} more_") - text = "*:rotating_light: New Failures:*\n" + "\n".join(lines) - while text: - detail_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": text[:2900]}, - }) - text = text[2900:] - recurring = d.get("recurring_failures", []) -if recurring: - lines = [] - for f_entry in recurring[:15]: - matrix = f_entry.get("matrix_label", "") - first = f_entry.get("first_seen", "?") - lines.append( - f":repeat: `{f_entry['name']}` ({f_entry['test_type']} / {matrix}) \u2014 since {first}" - ) - if len(recurring) > 15: - lines.append(f"_...and {len(recurring) - 15} more_") - detail_blocks.append({"type": "divider"}) - detail_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": "*:x: Recurring Failures:*\n" + "\n".join(lines)}, - }) - 
+flaky = d.get("flaky_tests", []) resolved = d.get("resolved_tests", []) -if resolved: - lines = [] - for r in resolved[:10]: - matrix = r.get("matrix_label", "") - count = r.get("failure_count", "?") - lines.append( - f":white_check_mark: `{r['name']}` ({r['test_type']} / {matrix}) \u2014 failed {count}x" - ) - if len(resolved) > 10: - lines.append(f"_...and {len(resolved) - 10} more_") - detail_blocks.append({"type": "divider"}) - detail_blocks.append({ - "type": "section", - "text": { - "type": "mrkdwn", - "text": "*:white_check_mark: Stabilized (were failing, now pass):*\n" + "\n".join(lines), - }, - }) -flaky = d.get("flaky_tests", []) -if flaky: - unique_flaky = {} - for f_entry in flaky: - key = f_entry["name"] - unique_flaky.setdefault(key, []).append(f_entry.get("matrix_label", "")) - lines = [] - for name, matrices in sorted(unique_flaky.items())[:10]: - matrix_str = ", ".join(matrices[:3]) - if len(matrices) > 3: - matrix_str += f" +{len(matrices)-3} more" - lines.append(f":warning: `{name}` ({matrix_str})") - if len(unique_flaky) > 10: - lines.append(f"_...and {len(unique_flaky) - 10} more unique flaky tests_") - detail_blocks.append({"type": "divider"}) - detail_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": "*:warning: Flaky Tests:*\n" + "\n".join(lines)}, - }) +# Collect all test issues by test_type (workflow) +issues_by_wf = {} +for f_entry in new_failures: + tt = f_entry.get("test_type", "unknown") + issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []}) + issues_by_wf[tt]["new"].append(f_entry) +for f_entry in recurring: + tt = f_entry.get("test_type", "unknown") + issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []}) + issues_by_wf[tt]["recurring"].append(f_entry) +for f_entry in flaky: + tt = f_entry.get("test_type", "unknown") + issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []}) + 
issues_by_wf[tt]["flaky"].append(f_entry) +for r in resolved: + tt = r.get("test_type", "unknown") + issues_by_wf.setdefault(tt, {"new": [], "recurring": [], "flaky": [], "resolved": []}) + issues_by_wf[tt]["resolved"].append(r) + +if issues_by_wf: + for wf_name, issues in sorted(issues_by_wf.items()): + wf_blocks = [] + wf_text = f"*{wf_name}*\n" + + # New failures + for f_entry in issues["new"][:10]: + msg = f_entry.get("message", "")[:60].replace("\n", " ") + matrix = f_entry.get("matrix_label", "") + wf_text += f":new: `{f_entry['name']}` ({matrix}) — {msg}\n" + + # Recurring failures + for f_entry in issues["recurring"][:10]: + matrix = f_entry.get("matrix_label", "") + first = f_entry.get("first_seen", "?") + wf_text += f":repeat: `{f_entry['name']}` ({matrix}) — since {first}\n" + + # Flaky + for f_entry in issues["flaky"][:10]: + matrix = f_entry.get("matrix_label", "") + wf_text += f":warning: `{f_entry['name']}` ({matrix})\n" + + # Resolved + for r in issues["resolved"][:5]: + matrix = r.get("matrix_label", "") + count = r.get("failure_count", "?") + wf_text += f":white_check_mark: `{r['name']}` ({matrix}) — was failing {count}x\n" + + # Truncation notes + for category, label, limit in [("new", "new failures", 10), ("recurring", "recurring", 10), + ("flaky", "flaky", 10), ("resolved", "resolved", 5)]: + if len(issues[category]) > limit: + wf_text += f"_...+{len(issues[category]) - limit} more {label}_\n" + + # Chunk if needed + while wf_text: + chunk = wf_text[:2900] + wf_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": chunk.rstrip()}, + }) + wf_text = wf_text[2900:] -if detail_blocks: - print(make_payload(detail_blocks)) + print(make_payload(wf_blocks)) PYEOF ) From 777a58dd7e01d3157676d07b030809b85ee6f7cc Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 14:02:15 -0500 Subject: [PATCH 22/60] Remove @channel ping from Slack notifications --- ci/utils/send_consolidated_summary.sh | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index d6466d1bd0..813bf3fd29 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -101,7 +101,7 @@ if untracked_count > 0 or (failed_jobs > 0 and has_new): elif failed_jobs > 0: parts.append(f"recurring failures in {failed_jobs} matrix job(s)") text = " + ".join(parts) - mention = " " + mention = "" elif failed_jobs > 0: emoji = ":x:" text = f"Recurring failures in {failed_jobs} matrix job(s)" From 9c34a9e31f0432e22c9fb1ad15cc05ab91a77ae9 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 16:26:11 -0500 Subject: [PATCH 23/60] Fix presigned URLs and reorder Slack thread replies - Map CUOPT_AWS_* to standard AWS env vars before aws s3 presign so the CLI has credentials in the container - Log presign failures instead of swallowing them silently - Reorder thread: test failures/details first, CI workflow overview last --- ci/nightly_summary.sh | 15 ++++- ci/utils/send_consolidated_summary.sh | 89 +++++++++++++-------------- 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index e23b11d1fc..647d08260a 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -73,9 +73,20 @@ python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ --workflow-jobs "${WORKFLOW_JOBS_JSON}" # --- Generate presigned URLs for reports (7-day expiry) --- +# Map CUOPT_AWS_* to standard AWS env vars for the aws CLI +export AWS_ACCESS_KEY_ID="${CUOPT_AWS_ACCESS_KEY_ID:-${AWS_ACCESS_KEY_ID:-}}" +export AWS_SECRET_ACCESS_KEY="${CUOPT_AWS_SECRET_ACCESS_KEY:-${AWS_SECRET_ACCESS_KEY:-}}" +unset AWS_SESSION_TOKEN + PRESIGN_EXPIRY=604800 -PRESIGNED_HTML=$(aws s3 presign "${S3_CONSOLIDATED_HTML}" --expires-in "${PRESIGN_EXPIRY}" 2>/dev/null || echo "") -PRESIGNED_DASHBOARD=$(aws s3 presign "${S3_DASHBOARD_URI}" --expires-in "${PRESIGN_EXPIRY}" 2>/dev/null || 
echo "") +PRESIGNED_HTML=$(aws s3 presign "${S3_CONSOLIDATED_HTML}" --expires-in "${PRESIGN_EXPIRY}" 2>&1) || { + echo "WARNING: Failed to generate presigned URL for report: ${PRESIGNED_HTML}" >&2 + PRESIGNED_HTML="" +} +PRESIGNED_DASHBOARD=$(aws s3 presign "${S3_DASHBOARD_URI}" --expires-in "${PRESIGN_EXPIRY}" 2>&1) || { + echo "WARNING: Failed to generate presigned URL for dashboard: ${PRESIGNED_DASHBOARD}" >&2 + PRESIGNED_DASHBOARD="" +} # Send consolidated Slack notification if webhook is available and this is a nightly build if [ -n "${CUOPT_SLACK_WEBHOOK_URL:-}" ] && [ "${RAPIDS_BUILD_TYPE:-}" = "nightly" ]; then diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 813bf3fd29..041a3de0e6 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -175,53 +175,7 @@ print(make_payload(blocks)) # THREAD REPLIES (lines 2+) — posted as replies to main message # ══════════════════════════════════════════════════════════════════════ -# ── Thread 1: CI Workflow Status (all jobs) ─────────────────────────── -# Shows every workflow job so new workflows are automatically visible. 
-if workflow_jobs: - ci_icons = {"success": ":white_check_mark:", "failure": ":x:", - "cancelled": ":no_entry_sign:", "skipped": ":fast_forward:"} - - # Group by workflow prefix (e.g., "conda-cpp-tests", "conda-notebook-tests") - wf_groups = {} - for j in workflow_jobs: - # Use the part before " / " as group name, or full name - prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] - wf_groups.setdefault(prefix, []).append(j) - - ci_blocks = [] - current = "*CI Workflow Status:*\n" - for group_name, group_jobs in sorted(wf_groups.items()): - passed = sum(1 for j in group_jobs if j["conclusion"] == "success") - failed = sum(1 for j in group_jobs if j["conclusion"] == "failure") - total = len(group_jobs) - - if failed > 0: - icon = ":x:" - detail = f"{failed}/{total} failed" - elif passed == total: - icon = ":white_check_mark:" - detail = f"{total} passed" - else: - icon = ":grey_question:" - detail = f"{passed}/{total} passed" - - line = f"{icon} *{group_name}* — {detail}\n" - if len(current) + len(line) > 2900: - ci_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": current.rstrip()}, - }) - current = "" - current += line - - if current.strip(): - ci_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": current.rstrip()}, - }) - print(make_payload(ci_blocks)) - -# ── Thread 2: Failing and flaky tests (grouped by workflow) ─────────── +# ── Thread 1: Failing and flaky tests (grouped by workflow) ─────────── # Build per-workflow test issue lists new_failures = d.get("new_failures", []) recurring = d.get("recurring_failures", []) @@ -291,6 +245,47 @@ if issues_by_wf: wf_text = wf_text[2900:] print(make_payload(wf_blocks)) + +# ── Thread 2: CI Workflow Status (all jobs) ─────────────────────────── +# Shows every workflow job so new workflows are automatically visible. 
+if workflow_jobs: + wf_groups = {} + for j in workflow_jobs: + prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] + wf_groups.setdefault(prefix, []).append(j) + + ci_blocks = [] + current = "*CI Workflow Status:*\n" + for group_name, group_jobs in sorted(wf_groups.items()): + passed = sum(1 for j in group_jobs if j["conclusion"] == "success") + failed = sum(1 for j in group_jobs if j["conclusion"] == "failure") + total = len(group_jobs) + + if failed > 0: + icon = ":x:" + detail = f"{failed}/{total} failed" + elif passed == total: + icon = ":white_check_mark:" + detail = f"{total} passed" + else: + icon = ":grey_question:" + detail = f"{passed}/{total} passed" + + line = f"{icon} *{group_name}* — {detail}\n" + if len(current) + len(line) > 2900: + ci_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current.rstrip()}, + }) + current = "" + current += line + + if current.strip(): + ci_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current.rstrip()}, + }) + print(make_payload(ci_blocks)) PYEOF ) From 8712ec73883f55ba8ebead5a51a85cdb94761926 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 16:43:19 -0500 Subject: [PATCH 24/60] Name failing workflows in Slack and add build summary - Main message now lists which workflows have failures by name (e.g., "Failures in: conda-notebook-tests, wheel-tests-cuopt") with per-workflow failure counts - Add build-summary job to build.yaml that sends a Slack message after all builds complete, showing pass/fail per build job - Build summary queries GitHub API for job statuses, grouped by workflow prefix (cpp-build, wheel-build-cuopt, docs-build, etc.) 
--- .github/workflows/build.yaml | 38 +++++++ ci/build_summary.sh | 152 ++++++++++++++++++++++++++ ci/utils/send_consolidated_summary.sh | 63 +++++++---- 3 files changed, 233 insertions(+), 20 deletions(-) create mode 100755 ci/build_summary.sh diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index a945cde8ec..910f469936 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -242,6 +242,44 @@ jobs: -f date="${INPUT_DATE}" \ -f sha="${INPUT_SHA}" + build-summary: + if: ${{ always() && (inputs.build_type == 'nightly') }} + needs: + - cpp-build + - python-build + - upload-conda + - wheel-build-cuopt-mps-parser + - wheel-publish-cuopt-mps-parser + - wheel-build-libcuopt + - wheel-publish-libcuopt + - wheel-build-cuopt + - wheel-publish-cuopt + - wheel-build-cuopt-server + - wheel-publish-cuopt-server + - wheel-build-cuopt-sh-client + - wheel-publish-cuopt-sh-client + - docs-build + runs-on: linux-amd64-cpu4 + container: + image: python:3.12-slim + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.sha }} + - name: Install dependencies + run: apt-get update && apt-get install -y --no-install-recommends curl + - name: Send build summary + env: + GITHUB_TOKEN: ${{ github.token }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_SERVER_URL: ${{ github.server_url }} + CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} + SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + RAPIDS_BRANCH: ${{ inputs.branch }} + run: bash ci/build_summary.sh + build-images: needs: - wheel-publish-cuopt diff --git a/ci/build_summary.sh b/ci/build_summary.sh new file mode 100755 index 0000000000..be3e028eab --- /dev/null +++ b/ci/build_summary.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# Send a Slack notification summarizing the build workflow status. +# Queries the GitHub API for job statuses and posts a compact message. + +set -euo pipefail + +BRANCH="${RAPIDS_BRANCH:-main}" +RUN_DATE="$(date +%F)" +GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}" +SLACK_WEBHOOK_URL="${CUOPT_SLACK_WEBHOOK_URL:-}" +SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}" +SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}" + +if [ -z "${SLACK_WEBHOOK_URL}" ] && [ -z "${SLACK_BOT_TOKEN}" ]; then + echo "No Slack credentials set, skipping build summary." + exit 0 +fi + +# Fetch workflow job statuses +JOBS_JSON="" +if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then + echo "Fetching build job statuses from GitHub API..." + JOBS_JSON=$(curl -s -L \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100") +fi + +# Generate Slack payload +PAYLOAD=$(python3 -c " +import json, sys + +jobs_json = sys.argv[1] +branch = sys.argv[2] +date = sys.argv[3] +run_url = sys.argv[4] + +jobs = json.loads(jobs_json).get('jobs', []) if jobs_json else [] + +# Filter out build-summary itself and compute-matrix helpers +jobs = [j for j in jobs + if 'build-summary' not in j.get('name', '').lower() + and 'compute-matrix' not in j.get('name', '').lower()] + +# Group by workflow prefix +groups = {} +for j in jobs: + name = j.get('name', '') + prefix = name.split(' / ')[0] if ' / ' in name else name + groups.setdefault(prefix, []).append(j) + +total = len(jobs) +failed_count = sum(1 for j in jobs if j.get('conclusion') == 'failure') +passed_count = sum(1 for j in jobs if j.get('conclusion') == 'success') + +if failed_count > 0: + emoji = ':x:' + status = f'{failed_count} build job(s) failed' +else: + emoji = 
':white_check_mark:' + status = f'All {passed_count} build jobs passed' + +blocks = [] +blocks.append({ + 'type': 'header', + 'text': {'type': 'plain_text', 'text': f'cuOpt Build \u2014 {branch} \u2014 {date}', 'emoji': True}, +}) +blocks.append({ + 'type': 'section', + 'text': {'type': 'mrkdwn', 'text': f'{emoji} *{status}*'}, +}) +blocks.append({'type': 'divider'}) + +# Build status per group +lines = [] +for group_name, group_jobs in sorted(groups.items()): + g_passed = sum(1 for j in group_jobs if j.get('conclusion') == 'success') + g_failed = sum(1 for j in group_jobs if j.get('conclusion') == 'failure') + g_total = len(group_jobs) + + if g_failed > 0: + icon = ':x:' + detail = f'{g_failed}/{g_total} failed' + elif g_passed == g_total: + icon = ':white_check_mark:' + detail = f'{g_total} passed' + else: + icon = ':grey_question:' + detail = f'{g_passed}/{g_total} passed' + lines.append(f'{icon} *{group_name}* \u2014 {detail}') + +text = '\n'.join(lines) +if len(text) > 2900: + text = text[:2900] + '\n_...truncated_' +blocks.append({ + 'type': 'section', + 'text': {'type': 'mrkdwn', 'text': text}, +}) + +# Link +if run_url: + blocks.append({'type': 'divider'}) + blocks.append({ + 'type': 'context', + 'elements': [{'type': 'mrkdwn', 'text': f'<{run_url}|:github: GitHub Actions>'}], + }) + +print(json.dumps({ + 'username': 'cuOpt Build Bot', + 'icon_emoji': ':package:', + 'blocks': blocks, +})) +" "${JOBS_JSON}" "${BRANCH}" "${RUN_DATE}" "${GITHUB_RUN_URL}") + +# Send via bot token (preferred) or webhook +echo "Sending build summary to Slack..." 
+if [ -n "${SLACK_BOT_TOKEN}" ] && [ -n "${SLACK_CHANNEL_ID}" ]; then + BOT_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +print(json.dumps(p)) +" "${PAYLOAD}" "${SLACK_CHANNEL_ID}") + + RESPONSE=$(curl -s -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${BOT_PAYLOAD}" \ + "https://slack.com/api/chat.postMessage") + + OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "") + if [ "${OK}" != "True" ]; then + echo "WARNING: chat.postMessage failed: ${RESPONSE}" >&2 + # Fall back to webhook + curl -s -X POST -H 'Content-type: application/json' --data "${PAYLOAD}" "${SLACK_WEBHOOK_URL}" || true + else + echo "Build summary posted to Slack." + fi +elif [ -n "${SLACK_WEBHOOK_URL}" ]; then + response=$(curl -s -X POST \ + -H 'Content-type: application/json' \ + --data "${PAYLOAD}" \ + "${SLACK_WEBHOOK_URL}") + if [ "${response}" != "ok" ]; then + echo "WARNING: Slack webhook returned: ${response}" >&2 + else + echo "Build summary posted to Slack." 
+ fi +fi diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 041a3de0e6..6e3e2b0543 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -89,24 +89,35 @@ def make_payload(blocks): # ══════════════════════════════════════════════════════════════════════ blocks = [] +# Identify which workflows have failures (from both CI jobs and matrix grid) +failing_workflows = set() +for j in failed_ci_jobs: + prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] + failing_workflows.add(prefix) +for g in grid: + if g["status"].startswith("failed"): + failing_workflows.add(g["test_type"]) +flaky_workflows = set() +for g in grid: + if g["status"] == "flaky": + flaky_workflows.add(g["test_type"]) + +has_failures = len(failing_workflows) > 0 untracked_count = len(untracked_failed) -if untracked_count > 0 or (failed_jobs > 0 and has_new): + +if has_failures and (has_new or untracked_count > 0): emoji = ":rotating_light:" - parts = [] - if untracked_count > 0: - names = [j["name"] for j in untracked_failed] - parts.append(f"{untracked_count} CI job(s) failed ({', '.join(names[:3])})") - if failed_jobs > 0 and has_new: - parts.append(f"NEW test failures in {failed_jobs} matrix job(s)") - elif failed_jobs > 0: - parts.append(f"recurring failures in {failed_jobs} matrix job(s)") - text = " + ".join(parts) + wf_list = ", ".join(sorted(failing_workflows)[:5]) + if len(failing_workflows) > 5: + wf_list += f" +{len(failing_workflows) - 5} more" + text = f"Failures in: {wf_list}" mention = "" -elif failed_jobs > 0: +elif has_failures: emoji = ":x:" - text = f"Recurring failures in {failed_jobs} matrix job(s)" + wf_list = ", ".join(sorted(failing_workflows)[:5]) + text = f"Recurring failures in: {wf_list}" mention = "" -elif flaky_jobs > 0: +elif flaky_workflows: emoji = ":large_yellow_circle:" text = "All jobs passed but flaky tests detected" mention = "" @@ -141,16 +152,28 @@ 
blocks.append({ }, }) -# Failed untracked CI jobs in main message (details in thread) -if untracked_failed: - names = [j["name"] for j in untracked_failed] - summary = f":x: *{len(untracked_failed)} CI job(s) failed:* " + ", ".join(f"`{n}`" for n in names[:5]) - if len(names) > 5: - summary += f" _+{len(names) - 5} more_" +# Per-workflow failure summary in main message +if failing_workflows: + lines = [] + for wf in sorted(failing_workflows): + # Count matrix failures for this workflow + wf_grid = [g for g in grid if g["test_type"] == wf and g["status"].startswith("failed")] + # Count CI-level failures + wf_ci = [j for j in failed_ci_jobs + if (j["name"].split(" / ")[0] if " / " in j["name"] else j["name"]) == wf] + parts = [] + if wf_grid: + parts.append(f"{len(wf_grid)} matrix job(s)") + if wf_ci and not any(not j.get("has_test_details", False) for j in wf_ci): + pass # already covered by matrix + elif wf_ci: + parts.append("CI job failed") + detail = ", ".join(parts) if parts else "failed" + lines.append(f":x: *{wf}* — {detail}") blocks.append({"type": "divider"}) blocks.append({ "type": "section", - "text": {"type": "mrkdwn", "text": summary}, + "text": {"type": "mrkdwn", "text": "\n".join(lines)}, }) # Links in main message From 4419b928d17043c2cdfc3eb16bd5bce3717e6562 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 17:05:23 -0500 Subject: [PATCH 25/60] Add summary-only flag to test.yaml for quick nightly-summary testing Skips all test jobs when summary-only=true, so nightly-summary runs immediately without waiting for GPU runners. 
--- .github/workflows/test.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 097f607244..2bbf8105e2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,9 +24,14 @@ on: description: "build_type: one of [branch, nightly, pull-request]" type: string default: nightly + summary-only: + description: "If true, skip all test jobs and run only nightly-summary" + type: boolean + default: false jobs: conda-cpp-tests: + if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} @@ -42,6 +47,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-python-tests: + if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: run_codecov: false @@ -58,6 +64,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} wheel-tests-cuopt: + if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} @@ -73,6 +80,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} wheel-tests-cuopt-server: + if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} @@ -88,6 +96,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-notebook-tests: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: From 1e711b38a819da2e1ca3ee329044bd5973a6f4e1 Mon Sep 17 00:00:00 2001 From: Ramakrishna 
Prabhu Date: Tue, 14 Apr 2026 17:08:17 -0500 Subject: [PATCH 26/60] Make dashboard self-contained and add summary-only test flag - Embed index.json and consolidated data directly into dashboard HTML during aggregation so it works on private S3 buckets without runtime fetches (no more 403 errors) - Dashboard falls back to S3 fetch if embedded data is absent - Add summary-only input to test.yaml to skip all test jobs and run only nightly-summary (avoids waiting for GPU runners when testing) --- ci/dashboard/index.html | 56 +++++++++++++++++++++-------------- ci/utils/aggregate_nightly.py | 28 ++++++++++++++++-- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index 9b56a7c915..fff5ca0d7c 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -223,37 +223,49 @@

cuOpt Nightly

/* Init */ /* ================================================================== */ async function init() { - // Determine S3 base URL from query param or auto-detect from location - const params = new URLSearchParams(window.location.search); - S.baseUrl = params.get('base_url') || deriveBaseUrl(); + // Use embedded data if available (injected by aggregate_nightly.py) + if (window.__EMBEDDED_INDEX__) { + S.index = window.__EMBEDDED_INDEX__; + S.embedded = true; + if (window.__EMBEDDED_CONSOLIDATED__) { + S.current = window.__EMBEDDED_CONSOLIDATED__; + } + } else { + // Fall back to fetching from S3 + const params = new URLSearchParams(window.location.search); + S.baseUrl = params.get('base_url') || deriveBaseUrl(); - if (!S.baseUrl) { - showEmpty('Set ?base_url=https://... to the S3 base URL for ci_test_reports/nightly/'); - return; - } + if (!S.baseUrl) { + showEmpty('Set ?base_url=https://... to the S3 base URL for ci_test_reports/nightly/'); + return; + } - // Ensure trailing slash - if (!S.baseUrl.endsWith('/')) S.baseUrl += '/'; + if (!S.baseUrl.endsWith('/')) S.baseUrl += '/'; - try { - // Load index - const indexResp = await fetch(S.baseUrl + 'index.json'); - if (!indexResp.ok) throw new Error(`index.json: ${indexResp.status}`); - S.index = await indexResp.json(); - } catch (e) { - showEmpty(`Failed to load index.json from ${esc(S.baseUrl)}.
${esc(e.message)}`); - return; + try { + const indexResp = await fetch(S.baseUrl + 'index.json'); + if (!indexResp.ok) throw new Error(`index.json: ${indexResp.status}`); + S.index = await indexResp.json(); + } catch (e) { + showEmpty(`Failed to load index.json from ${esc(S.baseUrl)}.
${esc(e.message)}`); + return; + } } populateDateSelector(); setupEventListeners(); - // Load most recent date - const dates = Object.keys(S.index.dates || {}).sort().reverse(); - if (dates.length > 0) { - await loadDate(dates[0]); + if (S.current) { + // Already have consolidated data from embedding + populateTestTypeFilters(); + render(); } else { - showEmpty('No nightly data available yet.'); + const dates = Object.keys(S.index.dates || {}).sort().reverse(); + if (dates.length > 0) { + await loadDate(dates[0]); + } else { + showEmpty('No nightly data available yet.'); + } } } diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index a0d6db4c90..2219bb85aa 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -704,11 +704,35 @@ def main(): output_dir, ) - # ---- Step 6: Upload dashboard ---- + # ---- Step 6: Upload dashboard (self-contained with embedded data) ---- if args.s3_dashboard_uri and args.dashboard_dir: dashboard_file = Path(args.dashboard_dir) / "index.html" if dashboard_file.exists(): - s3_upload(str(dashboard_file), args.s3_dashboard_uri) + # Read the index.json we just uploaded/created + index_path = output_dir / "index.json" + index_data = {} + if index_path.exists(): + with open(index_path) as f: + index_data = json.load(f) + + # Inject data into dashboard HTML so it works without S3 fetches + dashboard_html = dashboard_file.read_text() + inject_script = ( + "\n" + ) + # Insert before + dashboard_html = dashboard_html.replace( + "", inject_script + "" + ) + + embedded_path = output_dir / "dashboard.html" + embedded_path.write_text(dashboard_html) + s3_upload(str(embedded_path), args.s3_dashboard_uri) + print(f"Dashboard uploaded with embedded data") else: print( f"WARNING: Dashboard not found at {dashboard_file}", From cc9246e9598f42834718da75677e094967c1c6a5 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Tue, 14 Apr 2026 21:04:20 -0500 Subject: [PATCH 27/60] Show Failures tab first in 
dashboard instead of Matrix Grid --- ci/dashboard/index.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index fff5ca0d7c..511203af00 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -197,8 +197,8 @@

cuOpt Nightly

- - + + @@ -215,7 +215,7 @@

cuOpt Nightly

baseUrl: '', // Set by config or URL param index: null, // index.json data current: null, // Current day's consolidated_summary.json - activeTab: 'matrix', + activeTab: 'failures', filters: { testType: new Set(), status: 'all' }, }; From 1b45e2413135e8bfbe742341b573c8b62eb1bb1c Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 10:17:40 -0500 Subject: [PATCH 28/60] Trim verbose Slack output: compact stats, failures-only CI status - Test totals: only show failed/flaky counts, skip passed/skipped/total - CI Workflow Status thread: only list failing workflows, one-line summary for passing ones --- ci/utils/send_consolidated_summary.sh | 52 +++++++++++---------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 6e3e2b0543..0b57aa839f 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -128,13 +128,14 @@ else: text += f", all {passed_ci_count} CI jobs succeeded" mention = "" -stats = ( - f":white_check_mark: {totals.get('passed', 0)} passed | " - f":x: {totals.get('failed', 0)} failed | " - f":warning: {totals.get('flaky', 0)} flaky | " - f":fast_forward: {totals.get('skipped', 0)} skipped | " - f"Total: {totals.get('total', 0)}" -) +stats_parts = [] +if totals.get("failed", 0) > 0: + stats_parts.append(f":x: {totals['failed']} failed") +if totals.get("flaky", 0) > 0: + stats_parts.append(f":warning: {totals['flaky']} flaky") +if not stats_parts: + stats_parts.append(f":white_check_mark: {totals.get('total', 0)} tests passed") +stats = " | ".join(stats_parts) blocks.append({ "type": "header", @@ -269,46 +270,33 @@ if issues_by_wf: print(make_payload(wf_blocks)) -# ── Thread 2: CI Workflow Status (all jobs) ─────────────────────────── -# Shows every workflow job so new workflows are automatically visible. 
+# ── Thread 2: CI Workflow Status (only failures + summary) ──────────── if workflow_jobs: wf_groups = {} for j in workflow_jobs: prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] wf_groups.setdefault(prefix, []).append(j) - ci_blocks = [] - current = "*CI Workflow Status:*\n" + failed_lines = [] + passed_count = 0 for group_name, group_jobs in sorted(wf_groups.items()): passed = sum(1 for j in group_jobs if j["conclusion"] == "success") failed = sum(1 for j in group_jobs if j["conclusion"] == "failure") total = len(group_jobs) if failed > 0: - icon = ":x:" - detail = f"{failed}/{total} failed" - elif passed == total: - icon = ":white_check_mark:" - detail = f"{total} passed" + failed_lines.append(f":x: *{group_name}* — {failed}/{total} failed") else: - icon = ":grey_question:" - detail = f"{passed}/{total} passed" - - line = f"{icon} *{group_name}* — {detail}\n" - if len(current) + len(line) > 2900: - ci_blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": current.rstrip()}, - }) - current = "" - current += line + passed_count += 1 - if current.strip(): - ci_blocks.append({ + if failed_lines: + text = "*Failed CI Workflows:*\n" + "\n".join(failed_lines) + if passed_count > 0: + text += f"\n_{passed_count} other workflow(s) passed_" + print(make_payload([{ "type": "section", - "text": {"type": "mrkdwn", "text": current.rstrip()}, - }) - print(make_payload(ci_blocks)) + "text": {"type": "mrkdwn", "text": text}, + }])) PYEOF ) From 78c1e38b2f6e5a791fcf0b47d479464bbf24d331 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 13:24:43 -0500 Subject: [PATCH 29/60] Fix build summary argument list too long error Write GitHub API response to a temp file instead of passing it as a shell argument to Python. The jobs JSON for a full build matrix exceeds the OS argument length limit. 
--- ci/build_summary.sh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ci/build_summary.sh b/ci/build_summary.sh index be3e028eab..e8fd81a436 100755 --- a/ci/build_summary.sh +++ b/ci/build_summary.sh @@ -20,25 +20,29 @@ if [ -z "${SLACK_WEBHOOK_URL}" ] && [ -z "${SLACK_BOT_TOKEN}" ]; then fi # Fetch workflow job statuses -JOBS_JSON="" +JOBS_FILE=$(mktemp) if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then echo "Fetching build job statuses from GitHub API..." - JOBS_JSON=$(curl -s -L \ + curl -s -L \ -H "Authorization: Bearer ${GITHUB_TOKEN}" \ -H "Accept: application/vnd.github+json" \ - "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100") + "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \ + > "${JOBS_FILE}" +else + echo "{}" > "${JOBS_FILE}" fi # Generate Slack payload PAYLOAD=$(python3 -c " import json, sys -jobs_json = sys.argv[1] +with open(sys.argv[1]) as f: + data = json.load(f) branch = sys.argv[2] date = sys.argv[3] run_url = sys.argv[4] -jobs = json.loads(jobs_json).get('jobs', []) if jobs_json else [] +jobs = data.get('jobs', []) # Filter out build-summary itself and compute-matrix helpers jobs = [j for j in jobs @@ -113,7 +117,9 @@ print(json.dumps({ 'icon_emoji': ':package:', 'blocks': blocks, })) -" "${JOBS_JSON}" "${BRANCH}" "${RUN_DATE}" "${GITHUB_RUN_URL}") +" "${JOBS_FILE}" "${BRANCH}" "${RUN_DATE}" "${GITHUB_RUN_URL}") + +rm -f "${JOBS_FILE}" # Send via bot token (preferred) or webhook echo "Sending build summary to Slack..." 
From dd758865fc9d12327e1e45c106dfaeb585dbd0d7 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 13:27:34 -0500 Subject: [PATCH 30/60] Add summary-only flag to build.yaml for quick build-summary testing Skips all build/publish/test/image jobs when summary-only=true, so build-summary runs immediately without waiting for runners. --- .github/workflows/build.yaml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 910f469936..ee455a4452 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -37,6 +37,10 @@ on: If 'true', trigger the test workflow after all builds complete. type: boolean default: false + summary-only: + description: "If true, skip all build jobs and run only build-summary" + type: boolean + default: false concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -44,6 +48,7 @@ concurrency: jobs: cpp-build: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: @@ -53,6 +58,7 @@ jobs: sha: ${{ inputs.sha }} script: ci/build_cpp.sh python-build: + if: ${{ !inputs.summary-only }} needs: [cpp-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main @@ -63,6 +69,7 @@ jobs: sha: ${{ inputs.sha }} script: ci/build_python.sh upload-conda: + if: ${{ !inputs.summary-only }} needs: [cpp-build, python-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main @@ -72,6 +79,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} wheel-build-cuopt-mps-parser: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: @@ -86,6 +94,7 @@ jobs: # need 1 build per Python version and arch (but CUDA version doesn't matter so choose the latest) matrix_filter: 'group_by([.ARCH, 
(.PY_VER |split(".") | map(tonumber))])|map(max_by([(.CUDA_VER|split(".")|map(tonumber))]))' wheel-publish-cuopt-mps-parser: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-mps-parser secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -97,6 +106,7 @@ jobs: package-name: cuopt_mps_parser package-type: python wheel-build-libcuopt: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-mps-parser secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main @@ -110,6 +120,7 @@ jobs: package-type: cpp matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) wheel-publish-libcuopt: + if: ${{ !inputs.summary-only }} needs: wheel-build-libcuopt secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -121,6 +132,7 @@ jobs: package-name: libcuopt package-type: cpp wheel-build-cuopt: + if: ${{ !inputs.summary-only }} needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main @@ -133,6 +145,7 @@ jobs: package-name: cuopt package-type: python wheel-publish-cuopt: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -144,6 +157,7 @@ jobs: package-name: cuopt package-type: python wheel-build-cuopt-server: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: @@ -158,6 +172,7 @@ jobs: # Only need 1 package per CUDA major version. This selects "ARCH=amd64 + the latest supported Python, 1 job per major CUDA version". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) wheel-publish-cuopt-server: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-server secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -169,6 +184,7 @@ jobs: package-name: cuopt_server package-type: python docs-build: + if: ${{ !inputs.summary-only }} needs: [python-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main @@ -184,6 +200,7 @@ jobs: container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-sh-client: + if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: @@ -199,6 +216,7 @@ jobs: # only need 1 build (noarch package): this selects amd64, oldest-supported Python, latest-supported CUDA matrix_filter: '[map(select(.ARCH == "amd64")) | min_by((.PY_VER | split(".") | map(tonumber)), (.CUDA_VER | split(".") | map(-tonumber)))]' wheel-publish-cuopt-sh-client: + if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-sh-client secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -219,7 +237,7 @@ jobs: - wheel-publish-cuopt-server - wheel-publish-cuopt-sh-client - wheel-publish-libcuopt - if: inputs.trigger-tests + if: ${{ inputs.trigger-tests && !inputs.summary-only }} runs-on: ubuntu-latest # ref: https://docs.github.com/en/actions/reference/security/secure-use#use-an-intermediate-environment-variable env: @@ -281,6 +299,7 @@ jobs: run: bash ci/build_summary.sh build-images: + if: ${{ !inputs.summary-only }} needs: - wheel-publish-cuopt - wheel-publish-cuopt-server From bd1ec6fbdd2fdaf7b26530faee32c7f8fc25aa95 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 14:12:19 -0500 Subject: [PATCH 31/60] Show CI job 
pass/fail counts in main Slack message Use GitHub API job counts (e.g., "1/11 failed") instead of vague "failed" or "matrix job(s)" in the per-workflow failure summary. --- ci/utils/send_consolidated_summary.sh | 32 ++++++++++++++------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 0b57aa839f..bf63b48d94 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -153,24 +153,26 @@ blocks.append({ }, }) -# Per-workflow failure summary in main message +# Per-workflow failure summary using CI job counts from GitHub API +# Build a lookup: workflow prefix -> (failed, total) from workflow_jobs +wf_counts = {} +for j in workflow_jobs: + prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] + wf_counts.setdefault(prefix, {"failed": 0, "total": 0}) + wf_counts[prefix]["total"] += 1 + if j["conclusion"] == "failure": + wf_counts[prefix]["failed"] += 1 + if failing_workflows: lines = [] for wf in sorted(failing_workflows): - # Count matrix failures for this workflow - wf_grid = [g for g in grid if g["test_type"] == wf and g["status"].startswith("failed")] - # Count CI-level failures - wf_ci = [j for j in failed_ci_jobs - if (j["name"].split(" / ")[0] if " / " in j["name"] else j["name"]) == wf] - parts = [] - if wf_grid: - parts.append(f"{len(wf_grid)} matrix job(s)") - if wf_ci and not any(not j.get("has_test_details", False) for j in wf_ci): - pass # already covered by matrix - elif wf_ci: - parts.append("CI job failed") - detail = ", ".join(parts) if parts else "failed" - lines.append(f":x: *{wf}* — {detail}") + counts = wf_counts.get(wf, {}) + f_count = counts.get("failed", 0) + t_count = counts.get("total", 0) + if t_count > 0: + lines.append(f":x: *{wf}* — {f_count}/{t_count} failed") + else: + lines.append(f":x: *{wf}* — failed") blocks.append({"type": "divider"}) blocks.append({ "type": "section", From 
609db51bb5f3bde2c4c8292b3a1deaba503a5bb4 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 14:22:53 -0500 Subject: [PATCH 32/60] Show failures before matrix overview in consolidated HTML report --- ci/utils/aggregate_nightly.py | 55 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 2219bb85aa..ab27ca3365 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -386,34 +386,6 @@ def generate_consolidated_html(
{totals["resolved"]}
Stabilized
""") - # --- Matrix grid --- - parts.append("

Matrix Overview

") - parts.append( - "" - "" - ) - for g in agg["matrix_grid"]: - counts = g["counts"] - # Build link to per-matrix HTML report on S3 - report_link = "" - if s3_reports_prefix: - report_filename = f"{g['test_type']}-{g['matrix_label']}.html" - report_link = ( - f'View' - ) - parts.append( - f"" - f"" - f"" - f"" - f"" - f"" - f"" - f"" - ) - parts.append("
Test TypeMatrixStatusPassedFailedFlakyTotalReport
{_html_escape(g['test_type'])}{_html_escape(g['matrix_label'])}{_status_badge(g['status'])}{counts.get('passed', 0)}{counts.get('failed', 0)}{counts.get('flaky', 0)}{counts.get('total', 0)}{report_link}
") - # --- New failures --- if agg["all_new_failures"]: parts.append("

New Failures

") @@ -501,6 +473,33 @@ def generate_consolidated_html( "All tests passed across all matrices!

" ) + # --- Matrix grid (at the end) --- + parts.append("

Matrix Overview

") + parts.append( + "" + "" + ) + for g in agg["matrix_grid"]: + counts = g["counts"] + report_link = "" + if s3_reports_prefix: + report_filename = f"{g['test_type']}-{g['matrix_label']}.html" + report_link = ( + f'View' + ) + parts.append( + f"" + f"" + f"" + f"" + f"" + f"" + f"" + f"" + ) + parts.append("
Test TypeMatrixStatusPassedFailedFlakyTotalReport
{_html_escape(g['test_type'])}{_html_escape(g['matrix_label'])}{_status_badge(g['status'])}{counts.get('passed', 0)}{counts.get('failed', 0)}{counts.get('flaky', 0)}{counts.get('total', 0)}{report_link}
") + parts.append("") return "\n".join(parts) From bcde49c8549da6a36de2d2cae62d8cfa26cb2ee1 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 14:24:55 -0500 Subject: [PATCH 33/60] Handle date switching gracefully in embedded dashboard When the dashboard has embedded data and no S3 access, show a friendly message instead of a 403 error when switching dates. The embedded dashboard always shows the latest run. --- ci/dashboard/index.html | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index 511203af00..49872e4b5f 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -259,6 +259,11 @@

cuOpt Nightly

// Already have consolidated data from embedding populateTestTypeFilters(); render(); + // Mark date selector with current date when embedded + if (S.embedded) { + const sel = document.getElementById('date-select'); + sel.value = S.current.date || ''; + } } else { const dates = Object.keys(S.index.dates || {}).sort().reverse(); if (dates.length > 0) { @@ -281,6 +286,20 @@

cuOpt Nightly

/* ================================================================== */ async function loadDate(dateStr) { const main = document.getElementById('main-content'); + + // If embedded and this is the embedded date, use embedded data + if (S.embedded && S.current && S.current.date === dateStr) { + populateTestTypeFilters(); + render(); + return; + } + + // If embedded with no S3 access, can't load other dates + if (S.embedded && !S.baseUrl) { + showEmpty(`Only the latest run (${esc(S.current?.date || 'unknown')}) is available in this view. Download per-date reports from S3 for historical data.`); + return; + } + main.innerHTML = '
Loading data for ' + esc(dateStr) + '...
'; try { From 2499eb3391c1020b99bdd3815538118a85ea052b Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 14:27:37 -0500 Subject: [PATCH 34/60] Show branch and date as info labels in embedded dashboard Replace dropdowns with static labels when dashboard has embedded data, since switching dates/branches requires S3 access. --- ci/dashboard/index.html | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index 49872e4b5f..c83cd87d3f 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -151,9 +151,11 @@

cuOpt Nightly

+ +
@@ -259,10 +261,16 @@

cuOpt Nightly

// Already have consolidated data from embedding populateTestTypeFilters(); render(); - // Mark date selector with current date when embedded if (S.embedded) { - const sel = document.getElementById('date-select'); - sel.value = S.current.date || ''; + // Show branch and date as info labels, hide dropdowns + document.getElementById('date-select').style.display = 'none'; + document.getElementById('branch-select').style.display = 'none'; + const dateInfo = document.getElementById('date-info'); + const branchInfo = document.getElementById('branch-info'); + dateInfo.textContent = S.current.date || 'unknown'; + dateInfo.style.display = 'block'; + branchInfo.textContent = S.current.branch || 'unknown'; + branchInfo.style.display = 'block'; } } else { const dates = Object.keys(S.index.dates || {}).sort().reverse(); From b9e6b9803f013fc793a02967f0539c9c440e25cc Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 14:31:30 -0500 Subject: [PATCH 35/60] Add branch separation to S3 paths for multi-branch nightly support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - S3 summaries, reports, and history now include branch slug in path: summaries/{date}/{branch}/, reports/{date}/{branch}/, history/{branch}/ - Each branch gets its own dashboard at dashboard/{branch}/index.html - index.json entries keyed by date/branch instead of just date - Dashboard date selector shows "date — branch" labels - Trends filtered to current branch - Prevents main and release/26.04 nightlies from overwriting each other --- ci/dashboard/index.html | 56 ++++++++++++++++++++----------- ci/nightly_summary.sh | 11 +++--- ci/utils/aggregate_nightly.py | 15 +++++---- ci/utils/nightly_report_helper.sh | 6 ++-- 4 files changed, 55 insertions(+), 33 deletions(-) diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index c83cd87d3f..b24380b8a4 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -292,26 +292,32 @@

cuOpt Nightly

/* ================================================================== */ /* Data loading */ /* ================================================================== */ -async function loadDate(dateStr) { +async function loadDate(entryKey) { const main = document.getElementById('main-content'); - // If embedded and this is the embedded date, use embedded data - if (S.embedded && S.current && S.current.date === dateStr) { + // entryKey can be "date/branch" or just "date" (legacy) + const entry = S.index?.dates?.[entryKey] || {}; + const dateStr = entry.date || entryKey.split('/')[0] || entryKey; + const branch = entry.branch || entryKey.split('/')[1] || 'main'; + const branchSlug = branch.replace(/\//g, '-'); + + // If embedded and this matches the embedded data, use it + if (S.embedded && S.current && S.current.date === dateStr && S.current.branch === branch) { populateTestTypeFilters(); render(); return; } - // If embedded with no S3 access, can't load other dates + // If embedded with no S3 access, can't load other entries if (S.embedded && !S.baseUrl) { - showEmpty(`Only the latest run (${esc(S.current?.date || 'unknown')}) is available in this view. Download per-date reports from S3 for historical data.`); + showEmpty(`Only the latest run (${esc(S.current?.date || 'unknown')} / ${esc(S.current?.branch || 'unknown')}) is available in this view.`); return; } - main.innerHTML = '
Loading data for ' + esc(dateStr) + '...
'; + main.innerHTML = '
Loading data for ' + esc(dateStr) + ' / ' + esc(branch) + '...
'; try { - const url = S.baseUrl + 'summaries/' + dateStr + '/consolidated.json'; + const url = S.baseUrl + 'summaries/' + dateStr + '/' + branchSlug + '/consolidated.json'; const resp = await fetch(url); if (!resp.ok) throw new Error(`${resp.status}`); S.current = await resp.json(); @@ -501,21 +507,27 @@

cuOpt Nightly

function renderTrends() { if (!S.index || !S.index.dates) return '

No trend data available.

'; - const dates = Object.keys(S.index.dates).sort().slice(-14); // Last 14 days - if (!dates.length) return '

No trend data available.

'; + // Filter to current branch and sort by date + const currentBranch = S.current?.branch || 'main'; + const entries = Object.entries(S.index.dates) + .filter(([_, v]) => (v.branch || 'main') === currentBranch) + .sort((a, b) => a[0].localeCompare(b[0])) + .slice(-14); + if (!entries.length) return '

No trend data available.

'; - let html = '

Test Results — Last 14 Days

'; + let html = `

Test Results — Last 14 Runs (${esc(currentBranch)})

`; html += '
'; // Find max total for scaling let maxTotal = 1; - for (const d of dates) { - const t = S.index.dates[d].test_totals || {}; + for (const [_, val] of entries) { + const t = val.test_totals || {}; maxTotal = Math.max(maxTotal, t.total || 0); } - for (const d of dates) { - const t = S.index.dates[d].test_totals || {}; + for (const [key, val] of entries) { + const d = val.date || key.split('/')[0] || key; + const t = val.test_totals || {}; const total = t.total || 0; const passed = t.passed || 0; const failed = t.failed || 0; @@ -539,11 +551,12 @@

cuOpt Nightly

html += '
'; // Job pass rate trend - html += '

Matrix Job Pass Rate — Last 14 Days

'; + html += `

Matrix Job Pass Rate — Last 14 Runs (${esc(currentBranch)})

`; html += '
'; - for (const d of dates) { - const j = S.index.dates[d].job_summary || {}; + for (const [key, val] of entries) { + const d = val.date || key.split('/')[0] || key; + const j = val.job_summary || {}; const total = j.total || 0; const passed = j.passed || 0; const failed = j.failed || 0; @@ -638,8 +651,13 @@

cuOpt Nightly

/* ================================================================== */ function populateDateSelector() { const sel = document.getElementById('date-select'); - const dates = Object.keys(S.index.dates || {}).sort().reverse(); - sel.innerHTML = dates.map(d => ``).join(''); + const entries = Object.entries(S.index.dates || {}).sort((a, b) => b[0].localeCompare(a[0])); + sel.innerHTML = entries.map(([key, val]) => { + const date = val.date || key.split('/')[0] || key; + const branch = val.branch || key.split('/')[1] || ''; + const label = branch ? `${date} — ${branch}` : date; + return ``; + }).join(''); } function setupEventListeners() { diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 647d08260a..2c50a0cd36 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -34,12 +34,13 @@ if [ -z "${CUOPT_DATASET_S3_URI:-}" ]; then fi S3_BASE="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" -S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/" -S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/" -S3_CONSOLIDATED_JSON="${S3_BASE}/summaries/${RUN_DATE}/consolidated.json" -S3_CONSOLIDATED_HTML="${S3_BASE}/reports/${RUN_DATE}/consolidated.html" +BRANCH_SLUG=$(echo "${BRANCH}" | tr '/' '-') +S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/" +S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/" +S3_CONSOLIDATED_JSON="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/consolidated.json" +S3_CONSOLIDATED_HTML="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/consolidated.html" S3_INDEX_URI="${S3_BASE}/index.json" -S3_DASHBOARD_URI="${S3_BASE}/dashboard/index.html" +S3_DASHBOARD_URI="${S3_BASE}/dashboard/${BRANCH_SLUG}/index.html" DASHBOARD_DIR="${SCRIPT_DIR}/dashboard" # --- Query GitHub API for workflow job statuses --- diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index ab27ca3365..4767dc70c1 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -526,20 +526,23 @@ def 
update_index(s3_index_uri, date_str, consolidated, output_dir): except (json.JSONDecodeError, OSError): pass - # Add today's entry (compact — just enough for the dashboard trends) - index["dates"][date_str] = { + # Add today's entry keyed by date/branch for multi-branch support + branch = consolidated.get("branch", "main") + entry_key = f"{date_str}/{branch}" + index["dates"][entry_key] = { + "date": date_str, + "branch": branch, "job_summary": consolidated.get("job_summary", {}), "test_totals": consolidated.get("test_totals", {}), "has_new_failures": consolidated.get("has_new_failures", False), - "branch": consolidated.get("branch", ""), "github_run_url": consolidated.get("github_run_url", ""), } - # Prune to last N days + # Prune to last N entries dates_sorted = sorted(index["dates"].keys(), reverse=True) if len(dates_sorted) > MAX_INDEX_DAYS: - for old_date in dates_sorted[MAX_INDEX_DAYS:]: - del index["dates"][old_date] + for old_key in dates_sorted[MAX_INDEX_DAYS:]: + del index["dates"][old_key] # Write and upload with open(local_index, "w") as f: diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh index 809b918df8..0ab568c34d 100755 --- a/ci/utils/nightly_report_helper.sh +++ b/ci/utils/nightly_report_helper.sh @@ -72,9 +72,9 @@ generate_nightly_report() { if [ -n "${CUOPT_DATASET_S3_URI:-}" ]; then local s3_base="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" - s3_history_uri="${s3_base}/history/${test_type}-${branch_slug}-${matrix_label}.json" - s3_summary_uri="${s3_base}/summaries/${run_date}/${test_type}-${matrix_label}.json" - s3_html_uri="${s3_base}/reports/${run_date}/${test_type}-${matrix_label}.html" + s3_history_uri="${s3_base}/history/${branch_slug}/${test_type}-${matrix_label}.json" + s3_summary_uri="${s3_base}/summaries/${run_date}/${branch_slug}/${test_type}-${matrix_label}.json" + s3_html_uri="${s3_base}/reports/${run_date}/${branch_slug}/${test_type}-${matrix_label}.html" fi # --- Run nightly report --- From 
9f7cef5fedf31ffb87a66187fcf5663b033896b2 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 14:38:49 -0500 Subject: [PATCH 36/60] Fall back to legacy S3 path when branch-separated path is empty Checks if the new branch-separated summaries path has data before aggregating. Falls back to the old flat path for backward compatibility with summaries uploaded before the branch separation. --- ci/nightly_summary.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 2c50a0cd36..7cbc8a1c51 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -57,6 +57,15 @@ else echo "{}" > "${WORKFLOW_JOBS_JSON}" fi +# Check if branch-separated path has data; fall back to legacy path if empty +S3_SUMMARIES_LEGACY="${S3_BASE}/summaries/${RUN_DATE}/" +SUMMARY_COUNT=$(aws s3 ls "${S3_SUMMARIES_PREFIX}" 2>/dev/null | wc -l || echo "0") +if [ "${SUMMARY_COUNT}" -eq 0 ]; then + echo "No summaries at branch-separated path, falling back to legacy path" + S3_SUMMARIES_PREFIX="${S3_SUMMARIES_LEGACY}" + S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/" +fi + echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}" python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ From 889cc6091095e6d7a7c5917fbdf06fd91169ac2f Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 14:52:23 -0500 Subject: [PATCH 37/60] Pass date through to nightly-summary for correct S3 path lookup The nightly-summary was hardcoding today's date, but summaries on S3 are keyed by the date they were created. When re-running against earlier data, the date must match. Now uses the date input from the workflow, falling back to today if not provided. 
--- .github/workflows/nightly-summary.yaml | 8 ++++++++ .github/workflows/test.yaml | 1 + ci/nightly_summary.sh | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly-summary.yaml b/.github/workflows/nightly-summary.yaml index c286466937..724d55636e 100644 --- a/.github/workflows/nightly-summary.yaml +++ b/.github/workflows/nightly-summary.yaml @@ -20,6 +20,10 @@ on: required: true type: string default: nightly + date: + description: "Date (YYYY-MM-DD) for this run. Defaults to today." + required: false + type: string workflow_call: inputs: branch: @@ -31,6 +35,9 @@ on: build_type: required: true type: string + date: + required: false + type: string secrets: CUOPT_DATASET_S3_URI: required: true @@ -68,6 +75,7 @@ jobs: CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} RAPIDS_BRANCH: ${{ inputs.branch }} + RUN_DATE: ${{ inputs.date }} GITHUB_TOKEN: ${{ github.token }} GITHUB_RUN_ID: ${{ github.run_id }} GITHUB_REPOSITORY: ${{ github.repository }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2bbf8105e2..4fa51ade1b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -121,6 +121,7 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} build_type: ${{ inputs.build_type }} + date: ${{ inputs.date }} secrets: CUOPT_DATASET_S3_URI: ${{ secrets.CUOPT_DATASET_S3_URI }} CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 7cbc8a1c51..2494a22704 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -22,7 +22,7 @@ SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" OUTPUT_DIR="${PWD}/aggregate-output" mkdir -p "${OUTPUT_DIR}" -RUN_DATE="$(date +%F)" +RUN_DATE="${RUN_DATE:-$(date +%F)}" BRANCH="${RAPIDS_BRANCH:-main}" 
GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}" From f30a4e2c708919955c72550a997beab2955d4e46 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 15:19:16 -0500 Subject: [PATCH 38/60] Move AWS credential mapping before S3 fallback check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The aws s3 ls command in the legacy path fallback needs credentials to access the private bucket. Moving CUOPT_AWS_* → AWS_* mapping to the top of the script so all aws CLI calls have credentials. --- ci/nightly_summary.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 2494a22704..9fab77ff09 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -27,6 +27,11 @@ BRANCH="${RAPIDS_BRANCH:-main}" GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}" +# Map CUOPT_AWS_* to standard AWS env vars for the aws CLI +export AWS_ACCESS_KEY_ID="${CUOPT_AWS_ACCESS_KEY_ID:-${AWS_ACCESS_KEY_ID:-}}" +export AWS_SECRET_ACCESS_KEY="${CUOPT_AWS_SECRET_ACCESS_KEY:-${AWS_SECRET_ACCESS_KEY:-}}" +unset AWS_SESSION_TOKEN + if [ -z "${CUOPT_DATASET_S3_URI:-}" ]; then echo "WARNING: CUOPT_DATASET_S3_URI is not set. Skipping nightly aggregation." >&2 echo "The per-matrix reports (uploaded by individual test jobs) are still available on S3." 
@@ -83,11 +88,6 @@ python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ --workflow-jobs "${WORKFLOW_JOBS_JSON}" # --- Generate presigned URLs for reports (7-day expiry) --- -# Map CUOPT_AWS_* to standard AWS env vars for the aws CLI -export AWS_ACCESS_KEY_ID="${CUOPT_AWS_ACCESS_KEY_ID:-${AWS_ACCESS_KEY_ID:-}}" -export AWS_SECRET_ACCESS_KEY="${CUOPT_AWS_SECRET_ACCESS_KEY:-${AWS_SECRET_ACCESS_KEY:-}}" -unset AWS_SESSION_TOKEN - PRESIGN_EXPIRY=604800 PRESIGNED_HTML=$(aws s3 presign "${S3_CONSOLIDATED_HTML}" --expires-in "${PRESIGN_EXPIRY}" 2>&1) || { echo "WARNING: Failed to generate presigned URL for report: ${PRESIGNED_HTML}" >&2 From 404263c0165ecf50095ede250685eedf6672a3cf Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 15:26:50 -0500 Subject: [PATCH 39/60] Exclude consolidated.json from fallback path check Previous empty runs uploaded consolidated.json to the branch path, causing the fallback to think data exists. Now only counts actual per-matrix summary files when deciding whether to fall back. 
--- ci/nightly_summary.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 9fab77ff09..c8708c4777 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -62,11 +62,11 @@ else echo "{}" > "${WORKFLOW_JOBS_JSON}" fi -# Check if branch-separated path has data; fall back to legacy path if empty +# Check if branch-separated path has per-matrix summaries (exclude consolidated.json) S3_SUMMARIES_LEGACY="${S3_BASE}/summaries/${RUN_DATE}/" -SUMMARY_COUNT=$(aws s3 ls "${S3_SUMMARIES_PREFIX}" 2>/dev/null | wc -l || echo "0") +SUMMARY_COUNT=$(aws s3 ls "${S3_SUMMARIES_PREFIX}" 2>/dev/null | grep -v consolidated | grep -c '\.json' || echo "0") if [ "${SUMMARY_COUNT}" -eq 0 ]; then - echo "No summaries at branch-separated path, falling back to legacy path" + echo "No per-matrix summaries at branch path (${S3_SUMMARIES_PREFIX}), falling back to legacy path" S3_SUMMARIES_PREFIX="${S3_SUMMARIES_LEGACY}" S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/" fi From c90e73c0e8d35adbc640ca592bcd32f537ba4fa7 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 15:30:11 -0500 Subject: [PATCH 40/60] Remove legacy S3 path fallback Branch-separated paths are now the only path structure. A full build+test run will populate the new paths. 
--- ci/nightly_summary.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index c8708c4777..7e77fa5052 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -62,14 +62,6 @@ else echo "{}" > "${WORKFLOW_JOBS_JSON}" fi -# Check if branch-separated path has per-matrix summaries (exclude consolidated.json) -S3_SUMMARIES_LEGACY="${S3_BASE}/summaries/${RUN_DATE}/" -SUMMARY_COUNT=$(aws s3 ls "${S3_SUMMARIES_PREFIX}" 2>/dev/null | grep -v consolidated | grep -c '\.json' || echo "0") -if [ "${SUMMARY_COUNT}" -eq 0 ]; then - echo "No per-matrix summaries at branch path (${S3_SUMMARIES_PREFIX}), falling back to legacy path" - S3_SUMMARIES_PREFIX="${S3_SUMMARIES_LEGACY}" - S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/" -fi echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}" From 54d01fddf9a82bb0cec5ddfcfabc0f133c01d367 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 16:53:30 -0500 Subject: [PATCH 41/60] Add tests and build-images to build-summary needs Build summary should wait for and report on all jobs including the test trigger and image builds. --- .github/workflows/build.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ee455a4452..414e19e977 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -277,6 +277,8 @@ jobs: - wheel-build-cuopt-sh-client - wheel-publish-cuopt-sh-client - docs-build + - tests + - build-images runs-on: linux-amd64-cpu4 container: image: python:3.12-slim From b7efe3b1882cda811b48b5a247507cc83719c263 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 15 Apr 2026 16:55:49 -0500 Subject: [PATCH 42/60] Simplify build-summary needs to leaf jobs only Only need to depend on tests, build-images, and docs-build since they transitively depend on all upstream build/publish jobs. 
--- .github/workflows/build.yaml | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 414e19e977..3ba0edd8c1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -263,22 +263,9 @@ jobs: build-summary: if: ${{ always() && (inputs.build_type == 'nightly') }} needs: - - cpp-build - - python-build - - upload-conda - - wheel-build-cuopt-mps-parser - - wheel-publish-cuopt-mps-parser - - wheel-build-libcuopt - - wheel-publish-libcuopt - - wheel-build-cuopt - - wheel-publish-cuopt - - wheel-build-cuopt-server - - wheel-publish-cuopt-server - - wheel-build-cuopt-sh-client - - wheel-publish-cuopt-sh-client - - docs-build - tests - build-images + - docs-build runs-on: linux-amd64-cpu4 container: image: python:3.12-slim From f72f57da67a1c34040b540cf96575ddf56577f8d Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 10:14:07 -0500 Subject: [PATCH 43/60] Remove duplicate failure list and simplify status text - Status line now says "N workflow(s) with failures" instead of listing workflow names (the breakdown is right below) - Remove "Failed CI Workflows" thread reply since the main message already shows the same per-workflow counts --- ci/utils/send_consolidated_summary.sh | 35 ++------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index bf63b48d94..195a7d5797 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -107,15 +107,11 @@ untracked_count = len(untracked_failed) if has_failures and (has_new or untracked_count > 0): emoji = ":rotating_light:" - wf_list = ", ".join(sorted(failing_workflows)[:5]) - if len(failing_workflows) > 5: - wf_list += f" +{len(failing_workflows) - 5} more" - text = f"Failures in: {wf_list}" + text = f"{len(failing_workflows)} workflow(s) with failures" 
mention = "" elif has_failures: emoji = ":x:" - wf_list = ", ".join(sorted(failing_workflows)[:5]) - text = f"Recurring failures in: {wf_list}" + text = f"Recurring failures in {len(failing_workflows)} workflow(s)" mention = "" elif flaky_workflows: emoji = ":large_yellow_circle:" @@ -272,33 +268,6 @@ if issues_by_wf: print(make_payload(wf_blocks)) -# ── Thread 2: CI Workflow Status (only failures + summary) ──────────── -if workflow_jobs: - wf_groups = {} - for j in workflow_jobs: - prefix = j["name"].split(" / ")[0] if " / " in j["name"] else j["name"] - wf_groups.setdefault(prefix, []).append(j) - - failed_lines = [] - passed_count = 0 - for group_name, group_jobs in sorted(wf_groups.items()): - passed = sum(1 for j in group_jobs if j["conclusion"] == "success") - failed = sum(1 for j in group_jobs if j["conclusion"] == "failure") - total = len(group_jobs) - - if failed > 0: - failed_lines.append(f":x: *{group_name}* — {failed}/{total} failed") - else: - passed_count += 1 - - if failed_lines: - text = "*Failed CI Workflows:*\n" + "\n".join(failed_lines) - if passed_count > 0: - text += f"\n_{passed_count} other workflow(s) passed_" - print(make_payload([{ - "type": "section", - "text": {"type": "mrkdwn", "text": text}, - }])) PYEOF ) From 967483de5f7fd33b704db6530dfba117f73f3ac6 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 13:14:37 -0500 Subject: [PATCH 44/60] Fix S3 path mismatch: per-matrix summaries use flat paths The rapidsai shared workflows set RAPIDS_BRANCH to 'main' inside test containers regardless of the branch input. Per-matrix summaries are uploaded to flat date-based paths (summaries/{date}/), not branch-separated ones. Only nightly-summary outputs (consolidated report, dashboard) use branch-separated paths. Reverts nightly_report_helper.sh to original flat paths and updates nightly_summary.sh to read from flat paths while writing outputs to branch-separated paths. 
--- ci/nightly_summary.sh | 19 +++++++++++++++++-- ci/utils/nightly_report_helper.sh | 6 +++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 7e77fa5052..22c63f9742 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -40,8 +40,11 @@ fi S3_BASE="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" BRANCH_SLUG=$(echo "${BRANCH}" | tr '/' '-') -S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/" -S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/" +# Per-matrix summaries are uploaded by rapidsai shared workflows which use +# a flat date-based path (RAPIDS_BRANCH inside those containers is always "main"). +# Only our outputs (consolidated, dashboard) use branch-separated paths. +S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/" +S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/" S3_CONSOLIDATED_JSON="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/consolidated.json" S3_CONSOLIDATED_HTML="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/consolidated.html" S3_INDEX_URI="${S3_BASE}/index.json" @@ -63,6 +66,18 @@ else fi +echo "RUN_DATE=${RUN_DATE}, BRANCH=${BRANCH}, BRANCH_SLUG=${BRANCH_SLUG}" +echo "Listing S3 summaries at ${S3_SUMMARIES_PREFIX}:" +aws s3 ls "${S3_SUMMARIES_PREFIX}" 2>&1 || echo "(no files or access error)" +# Diagnostic: show what's on S3 for this date +echo "=== S3 diagnostics ===" +echo "RUN_DATE=${RUN_DATE} BRANCH=${BRANCH} BRANCH_SLUG=${BRANCH_SLUG}" +echo "Looking for summaries at: ${S3_SUMMARIES_PREFIX}" +aws s3 ls "${S3_SUMMARIES_PREFIX}" 2>&1 | head -5 || true +echo "All summaries for ${RUN_DATE}:" +aws s3 ls "${S3_BASE}/summaries/${RUN_DATE}/" 2>&1 | head -10 || true +echo "=== End diagnostics ===" + echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}" python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh index 0ab568c34d..809b918df8 
100755 --- a/ci/utils/nightly_report_helper.sh +++ b/ci/utils/nightly_report_helper.sh @@ -72,9 +72,9 @@ generate_nightly_report() { if [ -n "${CUOPT_DATASET_S3_URI:-}" ]; then local s3_base="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" - s3_history_uri="${s3_base}/history/${branch_slug}/${test_type}-${matrix_label}.json" - s3_summary_uri="${s3_base}/summaries/${run_date}/${branch_slug}/${test_type}-${matrix_label}.json" - s3_html_uri="${s3_base}/reports/${run_date}/${branch_slug}/${test_type}-${matrix_label}.html" + s3_history_uri="${s3_base}/history/${test_type}-${branch_slug}-${matrix_label}.json" + s3_summary_uri="${s3_base}/summaries/${run_date}/${test_type}-${matrix_label}.json" + s3_html_uri="${s3_base}/reports/${run_date}/${test_type}-${matrix_label}.html" fi # --- Run nightly report --- From 285cb9f365868421c35dd0e0bbef438db309062f Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 13:28:23 -0500 Subject: [PATCH 45/60] Fix s3_list to recursively find summaries in subdirectories The per-matrix summaries may be in branch subdirectories under the date prefix (e.g., summaries/2026-04-16/main/). The non-recursive aws s3 ls only returned directory prefixes, not actual files. Now uses --recursive to find all JSON files regardless of nesting. --- ci/utils/s3_helpers.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/ci/utils/s3_helpers.py b/ci/utils/s3_helpers.py index be1d2c872b..54e8b96d21 100644 --- a/ci/utils/s3_helpers.py +++ b/ci/utils/s3_helpers.py @@ -91,11 +91,13 @@ def s3_upload(local_path, s3_uri): def s3_list(s3_prefix): - """List objects under an S3 prefix. Returns list of S3 URIs.""" + """List objects under an S3 prefix (recursive). 
Returns list of S3 URIs.""" env = s3_env() + # Extract bucket and prefix from s3_prefix for reconstructing full URIs + # s3_prefix looks like "s3://bucket/path/to/prefix/" try: result = subprocess.run( - ["aws", "s3", "ls", s3_prefix], + ["aws", "s3", "ls", "--recursive", s3_prefix], env=env, check=True, capture_output=True, @@ -105,9 +107,18 @@ def s3_list(s3_prefix): print(f"WARNING: S3 ls failed: {exc}", file=sys.stderr) return [] + # --recursive output format: "2026-04-16 12:00:00 1234 path/to/file.json" + # We need to reconstruct full S3 URIs from the key paths + # Parse bucket from s3_prefix + if not s3_prefix.startswith("s3://"): + return [] + without_scheme = s3_prefix[5:] # remove "s3://" + bucket = without_scheme.split("/")[0] + base_uri = f"s3://{bucket}/" + uris = [] for line in result.stdout.strip().splitlines(): - parts = line.split() - if parts: - uris.append(f"{s3_prefix}{parts[-1]}") + parts = line.split(None, 3) # date, time, size, key + if len(parts) == 4: + uris.append(f"{base_uri}{parts[3]}") return uris From 94067f9707e7d664dc4b47f0d06847122eae3eba Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 13:33:18 -0500 Subject: [PATCH 46/60] Restore branch-separated paths for multi-branch nightly support Per-matrix summaries use branch subdirectories so main and release branches don't overwrite each other. For production nightlies, RAPIDS_BRANCH matches the branch input. s3_list is recursive so it handles any subdirectory structure. 
--- ci/nightly_summary.sh | 11 ++++++----- ci/utils/nightly_report_helper.sh | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 22c63f9742..3ad9210486 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -40,11 +40,12 @@ fi S3_BASE="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" BRANCH_SLUG=$(echo "${BRANCH}" | tr '/' '-') -# Per-matrix summaries are uploaded by rapidsai shared workflows which use -# a flat date-based path (RAPIDS_BRANCH inside those containers is always "main"). -# Only our outputs (consolidated, dashboard) use branch-separated paths. -S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/" -S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/" +# Per-matrix summaries are uploaded by test jobs under summaries/{date}/{branch}/. +# For production nightlies (main, release/*), RAPIDS_BRANCH matches the branch input. +# For feature branch testing, RAPIDS_BRANCH may default to "main" in rapidsai containers, +# so we search the date prefix recursively (s3_list handles this). 
+S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/" +S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/" S3_CONSOLIDATED_JSON="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/consolidated.json" S3_CONSOLIDATED_HTML="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/consolidated.html" S3_INDEX_URI="${S3_BASE}/index.json" diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh index 809b918df8..0ab568c34d 100755 --- a/ci/utils/nightly_report_helper.sh +++ b/ci/utils/nightly_report_helper.sh @@ -72,9 +72,9 @@ generate_nightly_report() { if [ -n "${CUOPT_DATASET_S3_URI:-}" ]; then local s3_base="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" - s3_history_uri="${s3_base}/history/${test_type}-${branch_slug}-${matrix_label}.json" - s3_summary_uri="${s3_base}/summaries/${run_date}/${test_type}-${matrix_label}.json" - s3_html_uri="${s3_base}/reports/${run_date}/${test_type}-${matrix_label}.html" + s3_history_uri="${s3_base}/history/${branch_slug}/${test_type}-${matrix_label}.json" + s3_summary_uri="${s3_base}/summaries/${run_date}/${branch_slug}/${test_type}-${matrix_label}.json" + s3_html_uri="${s3_base}/reports/${run_date}/${branch_slug}/${test_type}-${matrix_label}.html" fi # --- Run nightly report --- From 952b8f64b8cc19e1b2ab45ace9f402370736ed63 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 13:36:58 -0500 Subject: [PATCH 47/60] Remove unused status filter from dashboard sidebar The status filter only applied to the Matrix Grid tab which is no longer the default view. Tab navigation (Failures, Flaky, etc.) already serves as the status filter. --- ci/dashboard/index.html | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index b24380b8a4..73329dea0c 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -163,16 +163,6 @@

cuOpt Nightly

-
- -
- All - New Fail - Recurring - Flaky - Passed -
-
Passed
@@ -665,16 +655,6 @@

cuOpt Nightly

loadDate(e.target.value); }); - // Status filter chips - for (const chip of document.querySelectorAll('#status-filters .filter-chip')) { - chip.addEventListener('click', () => { - document.querySelectorAll('#status-filters .filter-chip') - .forEach(c => c.classList.remove('active')); - chip.classList.add('active'); - S.filters.status = chip.dataset.status; - renderTab(S.activeTab); - }); - } } function statusBadge(status) { From 4cfa8860a3d1530950822ffaf7d58099327c7f62 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 13:50:37 -0500 Subject: [PATCH 48/60] Add fallback S3 prefix for cross-branch summary lookup When the branch-specific summaries path is empty (e.g., feature branch testing where RAPIDS_BRANCH defaults to main in rapidsai containers), falls back to searching the date-level prefix to find summaries uploaded by test jobs. Ensures nightly-summary always finds data from the same run regardless of RAPIDS_BRANCH mismatch. --- ci/nightly_summary.sh | 16 +++++----------- ci/utils/aggregate_nightly.py | 26 ++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 3ad9210486..04ef08682b 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -67,22 +67,16 @@ else fi -echo "RUN_DATE=${RUN_DATE}, BRANCH=${BRANCH}, BRANCH_SLUG=${BRANCH_SLUG}" -echo "Listing S3 summaries at ${S3_SUMMARIES_PREFIX}:" -aws s3 ls "${S3_SUMMARIES_PREFIX}" 2>&1 || echo "(no files or access error)" -# Diagnostic: show what's on S3 for this date -echo "=== S3 diagnostics ===" -echo "RUN_DATE=${RUN_DATE} BRANCH=${BRANCH} BRANCH_SLUG=${BRANCH_SLUG}" -echo "Looking for summaries at: ${S3_SUMMARIES_PREFIX}" -aws s3 ls "${S3_SUMMARIES_PREFIX}" 2>&1 | head -5 || true -echo "All summaries for ${RUN_DATE}:" -aws s3 ls "${S3_BASE}/summaries/${RUN_DATE}/" 2>&1 | head -10 || true -echo "=== End diagnostics ===" +# Fallback: search the date-level prefix if branch-specific path is empty. 
+# This handles the case where RAPIDS_BRANCH in rapidsai containers differs +# from the branch input (e.g., feature branch testing where RAPIDS_BRANCH=main). +S3_SUMMARIES_FALLBACK="${S3_BASE}/summaries/${RUN_DATE}/" echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}" python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ --s3-summaries-prefix "${S3_SUMMARIES_PREFIX}" \ + --s3-summaries-fallback "${S3_SUMMARIES_FALLBACK}" \ --s3-reports-prefix "${S3_REPORTS_PREFIX}" \ --s3-output-uri "${S3_CONSOLIDATED_JSON}" \ --s3-html-output-uri "${S3_CONSOLIDATED_HTML}" \ diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 4767dc70c1..04989a4846 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -38,8 +38,11 @@ # --------------------------------------------------------------------------- -def download_summaries(s3_prefix, local_dir): +def download_summaries(s3_prefix, local_dir, s3_fallback_prefix=""): """Download all JSON summaries from S3 prefix into local_dir. + If s3_fallback_prefix is set and no summaries found at s3_prefix, + retries with the fallback (used when RAPIDS_BRANCH in rapidsai + containers doesn't match the branch input). 
Returns list of loaded summary dicts.""" local_dir = Path(local_dir) local_dir.mkdir(parents=True, exist_ok=True) @@ -49,6 +52,18 @@ def download_summaries(s3_prefix, local_dir): u for u in uris if u.endswith(".json") and not u.endswith("/consolidated.json") ] + + # Fallback: search the parent date prefix if branch-specific path is empty + if not json_uris and s3_fallback_prefix and s3_fallback_prefix != s3_prefix: + print(f"No summaries at {s3_prefix}, trying fallback: {s3_fallback_prefix}") + uris = s3_list(s3_fallback_prefix) + json_uris = [ + u for u in uris + if u.endswith(".json") and not u.endswith("/consolidated.json") + ] + if json_uris: + s3_prefix = s3_fallback_prefix + print(f"Found {len(json_uris)} summary file(s) at {s3_prefix}") summaries = [] @@ -567,6 +582,11 @@ def main(): default="", help="S3 prefix for per-matrix JSON summaries (e.g., s3://bucket/.../summaries/2026-04-13/)", ) + parser.add_argument( + "--s3-summaries-fallback", + default="", + help="Fallback S3 prefix if no summaries found at primary prefix", + ) parser.add_argument( "--s3-reports-prefix", default="", @@ -633,7 +653,9 @@ def main(): summaries = load_local_summaries(args.local_summaries_dir) elif args.s3_summaries_prefix: download_dir = output_dir / "downloaded_summaries" - summaries = download_summaries(args.s3_summaries_prefix, download_dir) + summaries = download_summaries( + args.s3_summaries_prefix, download_dir, args.s3_summaries_fallback + ) else: print( "ERROR: Provide --s3-summaries-prefix or --local-summaries-dir", From 7b5c12ef1045f9522c15cdc9f6b780c8c6fa19ea Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 15:21:21 -0500 Subject: [PATCH 49/60] Add CUOPT_S3_URI as common S3 base for reports New repo secret CUOPT_S3_URI (e.g., s3://cuopt-datasets/) is the bucket root. Scripts append ci_test_reports/nightly/ to build full paths, keeping reports outside ci_datasets/. Replaces the previous use of CUOPT_DATASET_S3_URI for report paths. 
--- .github/workflows/nightly-summary.yaml | 4 ++-- .github/workflows/test.yaml | 10 +++++++++- ci/nightly_summary.sh | 7 +++---- ci/utils/nightly_report_helper.sh | 4 ++-- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/workflows/nightly-summary.yaml b/.github/workflows/nightly-summary.yaml index 724d55636e..1bc3369c41 100644 --- a/.github/workflows/nightly-summary.yaml +++ b/.github/workflows/nightly-summary.yaml @@ -39,7 +39,7 @@ on: required: false type: string secrets: - CUOPT_DATASET_S3_URI: + CUOPT_S3_URI: required: true CUOPT_AWS_ACCESS_KEY_ID: required: true @@ -67,7 +67,7 @@ jobs: pip install awscli - name: Run nightly summary env: - CUOPT_DATASET_S3_URI: ${{ secrets.CUOPT_DATASET_S3_URI }} + CUOPT_S3_URI: ${{ secrets.CUOPT_S3_URI }} CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4fa51ade1b..a7bafa5cbd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,6 +46,8 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + script-env-secret-4-key: CUOPT_S3_URI + script-env-secret-4-value: ${{ secrets.CUOPT_S3_URI }} conda-python-tests: if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main @@ -63,6 +65,8 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + script-env-secret-4-key: CUOPT_S3_URI + script-env-secret-4-value: ${{ secrets.CUOPT_S3_URI }} wheel-tests-cuopt: if: ${{ !inputs.summary-only }} uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main @@ -79,6 +83,8 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + script-env-secret-4-key: CUOPT_S3_URI + script-env-secret-4-value: ${{ secrets.CUOPT_S3_URI }} wheel-tests-cuopt-server: if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main @@ -95,6 +101,8 @@ jobs: script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + script-env-secret-4-key: CUOPT_S3_URI + script-env-secret-4-value: ${{ secrets.CUOPT_S3_URI }} conda-notebook-tests: if: ${{ !inputs.summary-only }} secrets: inherit @@ -123,7 +131,7 @@ jobs: build_type: ${{ inputs.build_type }} date: ${{ inputs.date }} secrets: - CUOPT_DATASET_S3_URI: ${{ secrets.CUOPT_DATASET_S3_URI }} + CUOPT_S3_URI: ${{ secrets.CUOPT_S3_URI }} CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} CUOPT_SLACK_WEBHOOK_URL: ${{ secrets.CUOPT_SLACK_WEBHOOK_URL }} diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 04ef08682b..5cb0b0b639 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -32,13 +32,12 @@ export AWS_ACCESS_KEY_ID="${CUOPT_AWS_ACCESS_KEY_ID:-${AWS_ACCESS_KEY_ID:-}}" export AWS_SECRET_ACCESS_KEY="${CUOPT_AWS_SECRET_ACCESS_KEY:-${AWS_SECRET_ACCESS_KEY:-}}" unset AWS_SESSION_TOKEN -if [ -z "${CUOPT_DATASET_S3_URI:-}" ]; then - echo "WARNING: CUOPT_DATASET_S3_URI is not set. Skipping nightly aggregation." >&2 - echo "The per-matrix reports (uploaded by individual test jobs) are still available on S3." +if [ -z "${CUOPT_S3_URI:-}" ]; then + echo "WARNING: CUOPT_S3_URI is not set. Skipping nightly aggregation." 
>&2 exit 0 fi -S3_BASE="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" +S3_BASE="${CUOPT_S3_URI}ci_test_reports/nightly" BRANCH_SLUG=$(echo "${BRANCH}" | tr '/' '-') # Per-matrix summaries are uploaded by test jobs under summaries/{date}/{branch}/. # For production nightlies (main, release/*), RAPIDS_BRANCH matches the branch input. diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh index 0ab568c34d..deb887b441 100755 --- a/ci/utils/nightly_report_helper.sh +++ b/ci/utils/nightly_report_helper.sh @@ -70,8 +70,8 @@ generate_nightly_report() { local s3_summary_uri="" local s3_html_uri="" - if [ -n "${CUOPT_DATASET_S3_URI:-}" ]; then - local s3_base="${CUOPT_DATASET_S3_URI}ci_test_reports/nightly" + if [ -n "${CUOPT_S3_URI:-}" ]; then + local s3_base="${CUOPT_S3_URI}ci_test_reports/nightly" s3_history_uri="${s3_base}/history/${branch_slug}/${test_type}-${matrix_label}.json" s3_summary_uri="${s3_base}/summaries/${run_date}/${branch_slug}/${test_type}-${matrix_label}.json" s3_html_uri="${s3_base}/reports/${run_date}/${branch_slug}/${test_type}-${matrix_label}.html" From 2e8a5ec5153b5dd1a2394fa62e6c2047a8a40bd4 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 15:24:00 -0500 Subject: [PATCH 50/60] Unify S3 access under single CUOPT_S3_URI secret Replace CUOPT_DATASET_S3_URI with CUOPT_S3_URI (bucket root, e.g., s3://cuopt-datasets/). Scripts append the appropriate path prefix: - ci_datasets/routing/ for test datasets - ci_test_reports/nightly/ for reports and dashboards Set the repo secret CUOPT_S3_URI to the bucket root. The old CUOPT_DATASET_S3_URI secret can be removed after this merges. 
--- .github/workflows/pr.yaml | 16 ++++++++-------- .github/workflows/test.yaml | 28 ++++++++++++---------------- ci/nightly_summary.sh | 3 +-- ci/utils/nightly_report_helper.sh | 2 +- datasets/get_test_data.sh | 8 ++++---- 5 files changed, 26 insertions(+), 31 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a652c23b9a..be67501892 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -299,8 +299,8 @@ jobs: script: ci/test_cpp.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY @@ -323,8 +323,8 @@ jobs: script: ci/test_python.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY @@ -384,8 +384,8 @@ jobs: script: ci/test_wheel_cuopt.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY @@ 
-424,8 +424,8 @@ jobs: script: ci/test_wheel_cuopt_server.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a7bafa5cbd..5246ed0124 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -40,14 +40,13 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_cpp.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - script-env-secret-4-key: CUOPT_S3_URI - script-env-secret-4-value: ${{ secrets.CUOPT_S3_URI }} + conda-python-tests: if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main @@ -59,14 +58,13 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_python.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - 
script-env-secret-4-key: CUOPT_S3_URI - script-env-secret-4-value: ${{ secrets.CUOPT_S3_URI }} + wheel-tests-cuopt: if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main @@ -77,14 +75,13 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_cuopt.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - script-env-secret-4-key: CUOPT_S3_URI - script-env-secret-4-value: ${{ secrets.CUOPT_S3_URI }} + wheel-tests-cuopt-server: if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main @@ -95,14 +92,13 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_cuopt_server.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} - script-env-secret-4-key: CUOPT_S3_URI - script-env-secret-4-value: ${{ secrets.CUOPT_S3_URI }} + conda-notebook-tests: if: ${{ !inputs.summary-only }} secrets: inherit diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh index 5cb0b0b639..41790d8b44 100755 --- a/ci/nightly_summary.sh +++ b/ci/nightly_summary.sh @@ -6,8 +6,7 @@ # consolidated Slack notification. Runs as a post-test job after all # matrix CI jobs finish. # -# The script needs S3 access. 
It tries CUOPT_DATASET_S3_URI first, then -# falls back to standard AWS env vars set by aws-actions/configure-aws-credentials. +# The script needs S3 access via CUOPT_S3_URI (bucket root) and CUOPT_AWS_* credentials. # # Optional: # CUOPT_SLACK_WEBHOOK_URL - sends Slack if set diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh index deb887b441..c3b77e6b7a 100755 --- a/ci/utils/nightly_report_helper.sh +++ b/ci/utils/nightly_report_helper.sh @@ -22,7 +22,7 @@ # RAPIDS_CUDA_VERSION - CUDA version (e.g., "12.9") # RAPIDS_PY_VERSION - Python version (e.g., "3.12"), used with --with-python-version # RAPIDS_BRANCH - branch name (e.g., "main") -# CUOPT_DATASET_S3_URI - S3 base URI for reports +# CUOPT_S3_URI - S3 bucket root (e.g., s3://cuopt-datasets/) # GITHUB_SHA - commit SHA # GITHUB_STEP_SUMMARY - path for GitHub Actions step summary diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh index 528455e133..472813a003 100755 --- a/datasets/get_test_data.sh +++ b/datasets/get_test_data.sh @@ -8,7 +8,7 @@ set -o pipefail ################################################################################ # S3 Dataset Download Support ################################################################################ -# Set CUOPT_DATASET_S3_URI to base S3 path +# Set CUOPT_S3_URI to S3 bucket root (e.g., s3://cuopt-datasets/) # AWS credentials should be configured via: # - Environment variables (CUOPT_AWS_ACCESS_KEY_ID, CUOPT_AWS_SECRET_ACCESS_KEY) # - Standard AWS variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) @@ -18,8 +18,8 @@ set -o pipefail function try_download_from_s3() { local s3_dirs=("$@") # Array of directories to sync from S3 - if [ -z "${CUOPT_DATASET_S3_URI:-}" ]; then - echo "CUOPT_DATASET_S3_URI not set, skipping S3 download..." + if [ -z "${CUOPT_S3_URI:-}" ]; then + echo "CUOPT_S3_URI not set, skipping S3 download..." 
return 1 fi @@ -35,7 +35,7 @@ function try_download_from_s3() { fi # Append routing subdirectory to base S3 URI - local s3_uri="${CUOPT_DATASET_S3_URI}routing/" + local s3_uri="${CUOPT_S3_URI}ci_datasets/routing/" echo "Downloading datasets from S3..." # Use CUOPT-specific credentials only From 188ef3cf72ac4d1dff4c8ed2d169ba8c6ca21650 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 16 Apr 2026 15:25:24 -0500 Subject: [PATCH 51/60] Update last CUOPT_DATASET_S3_URI reference in developer skill --- skills/cuopt-developer/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/cuopt-developer/SKILL.md b/skills/cuopt-developer/SKILL.md index 98fe62f19c..66d41c003e 100644 --- a/skills/cuopt-developer/SKILL.md +++ b/skills/cuopt-developer/SKILL.md @@ -296,7 +296,7 @@ rmm::device_uvector data(100, stream); | CUDA out of memory | Reduce problem size | | Slow debug library loading | Device symbols cause delay | -| CI state doesn't persist between runs | CI containers are ephemeral. Never write persistent state to repo files from CI — use S3 (`CUOPT_DATASET_S3_URI`) or artifact stores. Ask: "After this container dies, does tomorrow's run see today's data?" | +| CI state doesn't persist between runs | CI containers are ephemeral. Never write persistent state to repo files from CI — use S3 (`CUOPT_S3_URI`) or artifact stores. Ask: "After this container dies, does tomorrow's run see today's data?" | | CI state transitions go unreported | When CI tracks state over time (e.g. test failures), every transition (new failure, recurring, stabilized) needs an explicit notification path. Ask: "When state X changes to Y, who learns about it and how?" | | Designing CI features without lifecycle check | Before shipping any CI feature that tracks state: (1) Where does state live between runs? (2) What writes/reads it? (3) What happens on state transitions? Verify end-to-end, not just the happy-path logic. 
| | Change applied to only some targets | Before implementing, audit the full scope of what needs the change. For CI: `ls ci/test*.sh`. For APIs: grep all callers. For patterns: find every instance. Enumerate ALL targets first, implement second. | From 46e1ca642f844e568dc57412a0d58407925e10c1 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Fri, 17 Apr 2026 10:52:16 -0500 Subject: [PATCH 52/60] Remove summary-only testing flags from build and test workflows --- .github/workflows/build.yaml | 21 +-------------------- .github/workflows/test.yaml | 9 --------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3ba0edd8c1..96766de4a2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -37,10 +37,6 @@ on: If 'true', trigger the test workflow after all builds complete. type: boolean default: false - summary-only: - description: "If true, skip all build jobs and run only build-summary" - type: boolean - default: false concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -48,7 +44,6 @@ concurrency: jobs: cpp-build: - if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: @@ -58,7 +53,6 @@ jobs: sha: ${{ inputs.sha }} script: ci/build_cpp.sh python-build: - if: ${{ !inputs.summary-only }} needs: [cpp-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main @@ -69,7 +63,6 @@ jobs: sha: ${{ inputs.sha }} script: ci/build_python.sh upload-conda: - if: ${{ !inputs.summary-only }} needs: [cpp-build, python-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main @@ -79,7 +72,6 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} wheel-build-cuopt-mps-parser: - if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: 
@@ -94,7 +86,6 @@ jobs: # need 1 build per Python version and arch (but CUDA version doesn't matter so choose the latest) matrix_filter: 'group_by([.ARCH, (.PY_VER |split(".") | map(tonumber))])|map(max_by([(.CUDA_VER|split(".")|map(tonumber))]))' wheel-publish-cuopt-mps-parser: - if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-mps-parser secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -106,7 +97,6 @@ jobs: package-name: cuopt_mps_parser package-type: python wheel-build-libcuopt: - if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-mps-parser secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main @@ -120,7 +110,6 @@ jobs: package-type: cpp matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) wheel-publish-libcuopt: - if: ${{ !inputs.summary-only }} needs: wheel-build-libcuopt secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -132,7 +121,6 @@ jobs: package-name: libcuopt package-type: cpp wheel-build-cuopt: - if: ${{ !inputs.summary-only }} needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main @@ -145,7 +133,6 @@ jobs: package-name: cuopt package-type: python wheel-publish-cuopt: - if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -157,7 +144,6 @@ jobs: package-name: cuopt package-type: python wheel-build-cuopt-server: - if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: @@ -172,7 +158,6 @@ jobs: # Only need 1 package per CUDA major version. This selects "ARCH=amd64 + the latest supported Python, 1 job per major CUDA version". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) wheel-publish-cuopt-server: - if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-server secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -184,7 +169,6 @@ jobs: package-name: cuopt_server package-type: python docs-build: - if: ${{ !inputs.summary-only }} needs: [python-build] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main @@ -200,7 +184,6 @@ jobs: container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-sh-client: - if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: @@ -216,7 +199,6 @@ jobs: # only need 1 build (noarch package): this selects amd64, oldest-supported Python, latest-supported CUDA matrix_filter: '[map(select(.ARCH == "amd64")) | min_by((.PY_VER | split(".") | map(tonumber)), (.CUDA_VER | split(".") | map(-tonumber)))]' wheel-publish-cuopt-sh-client: - if: ${{ !inputs.summary-only }} needs: wheel-build-cuopt-sh-client secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main @@ -237,7 +219,7 @@ jobs: - wheel-publish-cuopt-server - wheel-publish-cuopt-sh-client - wheel-publish-libcuopt - if: ${{ inputs.trigger-tests && !inputs.summary-only }} + if: inputs.trigger-tests runs-on: ubuntu-latest # ref: https://docs.github.com/en/actions/reference/security/secure-use#use-an-intermediate-environment-variable env: @@ -288,7 +270,6 @@ jobs: run: bash ci/build_summary.sh build-images: - if: ${{ !inputs.summary-only }} needs: - wheel-publish-cuopt - wheel-publish-cuopt-server diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5246ed0124..60f56b0c95 100644 --- a/.github/workflows/test.yaml +++ 
b/.github/workflows/test.yaml @@ -24,14 +24,9 @@ on: description: "build_type: one of [branch, nightly, pull-request]" type: string default: nightly - summary-only: - description: "If true, skip all test jobs and run only nightly-summary" - type: boolean - default: false jobs: conda-cpp-tests: - if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} @@ -48,7 +43,6 @@ jobs: script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-python-tests: - if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: run_codecov: false @@ -66,7 +60,6 @@ jobs: script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} wheel-tests-cuopt: - if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} @@ -83,7 +76,6 @@ jobs: script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} wheel-tests-cuopt-server: - if: ${{ !inputs.summary-only }} uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} @@ -100,7 +92,6 @@ jobs: script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-notebook-tests: - if: ${{ !inputs.summary-only }} secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: From 31b12a11e11c8557efc04098ca63b1cbbc788184 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Fri, 17 Apr 2026 11:01:39 -0500 Subject: [PATCH 53/60] Add SPDX license header to dashboard HTML --- ci/dashboard/index.html | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index 73329dea0c..a1ae36b9cb 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -1,3 +1,7 @@ + From 893029149a6abbc9dd3fa77663fe4ec01d583b4a Mon Sep 17 00:00:00 2001 From: 
Ramakrishna Prabhu Date: Fri, 17 Apr 2026 11:27:31 -0500 Subject: [PATCH 54/60] Improve nightly CI reporting UX across Slack, dashboard, and HTML - Increase error message truncation to 150 chars for new failures in Slack thread replies (recurring failures stay short) - Append failing matrix labels to per-workflow summary in main Slack message (up to 3, then "+N more") - Add per-job "View Logs" links in Slack thread workflow sections - Add "Flake Rate" column and rename "Retries" to "Flake Count" in dashboard flaky tests table - Add clickable "View Logs" links for failed jobs in build summary - Make test names in HTML report failure tables link to source on GitHub when suite looks like a file path and sha is available --- ci/build_summary.sh | 4 +++ ci/dashboard/index.html | 20 ++++++++++++++- ci/utils/aggregate_nightly.py | 22 ++++++++++++++-- ci/utils/send_consolidated_summary.sh | 37 +++++++++++++++++++++++---- 4 files changed, 75 insertions(+), 8 deletions(-) diff --git a/ci/build_summary.sh b/ci/build_summary.sh index e8fd81a436..4501fbd152 100755 --- a/ci/build_summary.sh +++ b/ci/build_summary.sh @@ -88,6 +88,10 @@ for group_name, group_jobs in sorted(groups.items()): if g_failed > 0: icon = ':x:' detail = f'{g_failed}/{g_total} failed' + # Add clickable log links for failed jobs + failed_in_group = [j for j in group_jobs if j.get('conclusion') == 'failure'] + if failed_in_group and failed_in_group[0].get('html_url'): + detail += f' <{failed_in_group[0]["html_url"]}|View Logs>' elif g_passed == g_total: icon = ':white_check_mark:' detail = f'{g_total} passed' diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index a1ae36b9cb..a36a9da0d7 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -457,14 +457,32 @@

cuOpt Nightly

return '

No flaky tests matching current filters.

'; let html = '

Flaky Tests (passed on retry)

'; - html += ''; + html += ''; for (const e of items) { + // Compute flake rate from index data if available (flaky / total across runs) + let flakeRate = '—'; + if (S.index && S.index.dates) { + const currentBranch = S.current?.branch || 'main'; + const dateEntries = Object.values(S.index.dates) + .filter(v => (v.branch || 'main') === currentBranch); + const totalRuns = dateEntries.length; + if (totalRuns > 0) { + const totalFlaky = dateEntries.reduce((sum, v) => + sum + ((v.test_totals || {}).flaky || 0), 0); + const totalTests = dateEntries.reduce((sum, v) => + sum + ((v.test_totals || {}).total || 0), 0); + if (totalTests > 0) { + flakeRate = (totalFlaky / totalTests * 100).toFixed(2) + '%'; + } + } + } html += ` + `; } html += '
Test TypeMatrixSuiteTestRetries
Test TypeMatrixSuiteTestFlake CountFlake Rate
${esc(e.test_type||'')} ${esc(e.matrix_label||'')} ${esc(e.suite)} ${esc(e.name)} FLAKY ${e.retry_count||'?'}${flakeRate}
'; diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 04989a4846..71d7579de9 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -401,6 +401,24 @@ def generate_consolidated_html(
{totals["resolved"]}
Stabilized
""") + # Helper: build a GitHub source link for test names when suite looks like a file path + def _test_name_html(entry): + """Return HTML for the test name, linked to source if suite looks like a file path.""" + name_escaped = _html_escape(entry['name']) + suite = entry.get('suite', '') + # Find the sha from the matching grid entry + sha = "unknown" + for g in agg["matrix_grid"]: + if (g["test_type"] == entry.get("test_type") + and g["matrix_label"] == entry.get("matrix_label") + and g.get("sha")): + sha = g["sha"] + break + if sha != "unknown" and suite and ('/' in suite or suite.endswith('.py')): + url = f"https://github.com/NVIDIA/cuopt/blob/{_html_escape(sha)}/{_html_escape(suite)}" + return f'{name_escaped}' + return f"{name_escaped}" + # --- New failures --- if agg["all_new_failures"]: parts.append("

New Failures

") @@ -415,7 +433,7 @@ def generate_consolidated_html( f"" f"" f"" - f"" + f"" f"' ) @@ -435,7 +453,7 @@ def generate_consolidated_html( f"" f"" f"" - f"" + f"" f"" f"' diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 195a7d5797..199f19fc3d 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -159,16 +159,31 @@ for j in workflow_jobs: if j["conclusion"] == "failure": wf_counts[prefix]["failed"] += 1 +# Build a lookup: workflow prefix -> list of failing matrix_labels from grid +wf_failing_labels = {} +for g in grid: + if g["status"].startswith("failed"): + wf_failing_labels.setdefault(g["test_type"], []).append(g["matrix_label"]) + if failing_workflows: lines = [] for wf in sorted(failing_workflows): counts = wf_counts.get(wf, {}) f_count = counts.get("failed", 0) t_count = counts.get("total", 0) + # Append failing matrix labels (up to 3, then "+N more") + labels = wf_failing_labels.get(wf, []) + label_suffix = "" + if labels: + shown = labels[:3] + label_suffix = " (" + ", ".join(shown) + if len(labels) > 3: + label_suffix += f", +{len(labels) - 3} more" + label_suffix += ")" if t_count > 0: - lines.append(f":x: *{wf}* — {f_count}/{t_count} failed") + lines.append(f":x: *{wf}* — {f_count}/{t_count} failed{label_suffix}") else: - lines.append(f":x: *{wf}* — failed") + lines.append(f":x: *{wf}* — failed{label_suffix}") blocks.append({"type": "divider"}) blocks.append({ "type": "section", @@ -228,13 +243,13 @@ if issues_by_wf: wf_blocks = [] wf_text = f"*{wf_name}*\n" - # New failures + # New failures (show more error context — 150 chars) for f_entry in issues["new"][:10]: - msg = f_entry.get("message", "")[:60].replace("\n", " ") + msg = f_entry.get("message", "")[:150].replace("\n", " ") matrix = f_entry.get("matrix_label", "") wf_text += f":new: `{f_entry['name']}` ({matrix}) — {msg}\n" - # Recurring failures + # Recurring failures (shorter — just show since date) for 
f_entry in issues["recurring"][:10]: matrix = f_entry.get("matrix_label", "") first = f_entry.get("first_seen", "?") @@ -257,6 +272,18 @@ if issues_by_wf: if len(issues[category]) > limit: wf_text += f"_...+{len(issues[category]) - limit} more {label}_\n" + # Per-job log links: find workflow_jobs matching this workflow prefix + job_urls = [j["url"] for j in workflow_jobs + if j.get("url") and j["name"].split(" / ")[0] == wf_name + and j["conclusion"] == "failure"] + if not job_urls: + # Also try matching by test_type prefix for tracked jobs + job_urls = [j["url"] for j in workflow_jobs + if j.get("url") and j["name"].startswith(wf_name) + and j["conclusion"] == "failure"] + if job_urls: + wf_text += f"<{job_urls[0]}|:link: View Logs>\n" + # Chunk if needed while wf_text: chunk = wf_text[:2900] From 53db6009ea87d4fb00e11dda778017be9982efbe Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Fri, 17 Apr 2026 11:33:07 -0500 Subject: [PATCH 55/60] Ping user on new failures and reorder: new > flaky > recurring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Mention @rgsl888prabhu in Slack only when there are NEW failures (not for recurring or flaky-only runs) - Reorder everywhere: new failures first, then flaky, then recurring, then resolved — consistent across Slack thread, HTML report, and dashboard tabs --- ci/dashboard/index.html | 2 +- ci/utils/aggregate_nightly.py | 34 +++++++++++++-------------- ci/utils/send_consolidated_summary.sh | 18 +++++++------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html index a36a9da0d7..2cdd7406ce 100644 --- a/ci/dashboard/index.html +++ b/ci/dashboard/index.html @@ -194,9 +194,9 @@

cuOpt Nightly

- +
diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 71d7579de9..17cea4dec7 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -439,6 +439,23 @@ def _test_name_html(entry): ) parts.append("
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{_test_name_html(e)}
{short}" f'
{msg}
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{_test_name_html(e)}{_html_escape(e.get('first_seen', '?'))}
{short}" f'
{msg}
") + # --- Flaky --- + if agg["all_flaky_tests"]: + parts.append("

Flaky Tests

") + parts.append( + "" + "" + ) + for e in agg["all_flaky_tests"]: + parts.append( + f"" + f"" + f"" + f"" + f"" + ) + parts.append("
Test TypeMatrixSuiteTestRetries
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{e.get('retry_count', '?')}
") + # --- Recurring failures --- if agg["all_recurring_failures"]: parts.append("

Recurring Failures

") @@ -478,23 +495,6 @@ def _test_name_html(entry): ) parts.append("
") - # --- Flaky --- - if agg["all_flaky_tests"]: - parts.append("

Flaky Tests

") - parts.append( - "" - "" - ) - for e in agg["all_flaky_tests"]: - parts.append( - f"" - f"" - f"" - f"" - f"" - ) - parts.append("
Test TypeMatrixSuiteTestRetries
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{e.get('retry_count', '?')}
") - if ( not agg["all_new_failures"] and not agg["all_recurring_failures"] diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 199f19fc3d..02e332e13d 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -107,8 +107,8 @@ untracked_count = len(untracked_failed) if has_failures and (has_new or untracked_count > 0): emoji = ":rotating_light:" - text = f"{len(failing_workflows)} workflow(s) with failures" - mention = "" + text = f"{len(failing_workflows)} workflow(s) with NEW failures" + mention = "<@rgsl888prabhu> " elif has_failures: emoji = ":x:" text = f"Recurring failures in {len(failing_workflows)} workflow(s)" @@ -243,23 +243,23 @@ if issues_by_wf: wf_blocks = [] wf_text = f"*{wf_name}*\n" - # New failures (show more error context — 150 chars) + # New failures first (most urgent, show more error context) for f_entry in issues["new"][:10]: msg = f_entry.get("message", "")[:150].replace("\n", " ") matrix = f_entry.get("matrix_label", "") wf_text += f":new: `{f_entry['name']}` ({matrix}) — {msg}\n" - # Recurring failures (shorter — just show since date) + # Flaky (actionable — tests that are unstable) + for f_entry in issues["flaky"][:10]: + matrix = f_entry.get("matrix_label", "") + wf_text += f":warning: `{f_entry['name']}` ({matrix})\n" + + # Recurring failures (known issues) for f_entry in issues["recurring"][:10]: matrix = f_entry.get("matrix_label", "") first = f_entry.get("first_seen", "?") wf_text += f":repeat: `{f_entry['name']}` ({matrix}) — since {first}\n" - # Flaky - for f_entry in issues["flaky"][:10]: - matrix = f_entry.get("matrix_label", "") - wf_text += f":warning: `{f_entry['name']}` ({matrix})\n" - # Resolved for r in issues["resolved"][:5]: matrix = r.get("matrix_label", "") From 9c3cc99ff3cc71e63010a0cfd9ff9b873453a3ff Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Fri, 17 Apr 2026 11:58:43 -0500 Subject: [PATCH 56/60] Show useful error details 
in HTML report summaries Extract the last line of error messages (usually the assertion or exception) instead of the first line (usually the test method signature). Increases visible summary to 200 chars. Full error still available in expandable details. --- ci/utils/aggregate_nightly.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py index 17cea4dec7..f515f36acd 100644 --- a/ci/utils/aggregate_nightly.py +++ b/ci/utils/aggregate_nightly.py @@ -419,6 +419,25 @@ def _test_name_html(entry): return f'{name_escaped}' return f"{name_escaped}" + def _error_summary(message, max_len=200): + """Extract the most useful part of an error message for display. + Prefers the last line (usually the assertion) over the first + (usually the test method signature).""" + if not message: + return "" + lines = [l.strip() for l in message.strip().splitlines() if l.strip()] + # Use the last non-empty line (typically the assertion/error) + if lines: + summary = lines[-1] + # If the last line is very short, include the previous line too + if len(summary) < 40 and len(lines) > 1: + summary = lines[-2] + " — " + summary + else: + summary = message + if len(summary) > max_len: + summary = summary[:max_len] + "..." + return summary + # --- New failures --- if agg["all_new_failures"]: parts.append("

New Failures

") @@ -428,7 +447,7 @@ def _test_name_html(entry): ) for e in agg["all_new_failures"]: msg = _html_escape(e.get("message", "")) - short = _html_escape(e.get("message", "")[:100]) + short = _html_escape(_error_summary(e.get("message", ""))) parts.append( f"" f"" @@ -465,7 +484,7 @@ def _test_name_html(entry): ) for e in agg["all_recurring_failures"]: msg = _html_escape(e.get("message", "")) - short = _html_escape(e.get("message", "")[:100]) + short = _html_escape(_error_summary(e.get("message", ""))) parts.append( f"" f"" From 1d064d741143e23d774f0473edaef941c7f33ea3 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Fri, 17 Apr 2026 12:01:17 -0500 Subject: [PATCH 57/60] Chunk all Slack message blocks to respect 3000-char limit Add chunking to per-workflow failure block in main message, per-matrix summary sections (failures, stabilized, flaky), and build summary job list. Prevents invalid_blocks errors when many workflows or tests fail simultaneously. --- ci/build_summary.sh | 15 +++++++------ ci/utils/send_consolidated_summary.sh | 19 ++++++++++++---- ci/utils/send_nightly_summary.sh | 32 +++++++++++++-------------- 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/ci/build_summary.sh b/ci/build_summary.sh index 4501fbd152..f61a402a66 100755 --- a/ci/build_summary.sh +++ b/ci/build_summary.sh @@ -100,13 +100,14 @@ for group_name, group_jobs in sorted(groups.items()): detail = f'{g_passed}/{g_total} passed' lines.append(f'{icon} *{group_name}* \u2014 {detail}') -text = '\n'.join(lines) -if len(text) > 2900: - text = text[:2900] + '\n_...truncated_' -blocks.append({ - 'type': 'section', - 'text': {'type': 'mrkdwn', 'text': text}, -}) +current = '' +for line in lines: + if current and len(current) + len(line) + 1 > 2900: + blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': current.rstrip()}}) + current = '' + current += line + '\n' +if current.strip(): + blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': 
current.rstrip()}}) # Link if run_url: diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index 02e332e13d..a568d237ab 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -185,10 +185,21 @@ if failing_workflows: else: lines.append(f":x: *{wf}* — failed{label_suffix}") blocks.append({"type": "divider"}) - blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": "\n".join(lines)}, - }) + # Chunk to stay within Slack's 3000-char block limit + current = "" + for line in lines: + if current and len(current) + len(line) + 1 > 2900: + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current.rstrip()}, + }) + current = "" + current += line + "\n" + if current.strip(): + blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current.rstrip()}, + }) # Links in main message link_parts = [] diff --git a/ci/utils/send_nightly_summary.sh b/ci/utils/send_nightly_summary.sh index 7b39a02cec..63c742a2ca 100755 --- a/ci/utils/send_nightly_summary.sh +++ b/ci/utils/send_nightly_summary.sh @@ -92,20 +92,27 @@ blocks.append({ blocks.append({"type": "divider"}) +def chunk_lines_to_blocks(header, lines, blocks, limit=2900): + """Add lines as section blocks, chunking to stay under Slack's char limit.""" + current = f"*{header}*\n" + for line in lines: + if len(current) + len(line) + 1 > limit: + blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": current.rstrip()}}) + current = "" + current += line + "\n" + if current.strip(): + blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": current.rstrip()}}) + # --- Genuine failures --- if failed > 0: lines = [] for f_entry in d.get("new_failures", []): - msg = f_entry.get("message", "")[:60].replace("\n", " ") + msg = f_entry.get("message", "")[:150].replace("\n", " ") lines.append(f" :new: `{f_entry['name']}` ({f_entry['suite']}) \u2014 {msg}") for f_entry in 
d.get("recurring_failures", []): - msg = f_entry.get("message", "")[:60].replace("\n", " ") first = f_entry.get("first_seen", "?") lines.append(f" :repeat: `{f_entry['name']}` ({f_entry['suite']}) \u2014 since {first}") - blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": "*Genuine Failures:*\n" + "\n".join(lines)}, - }) + chunk_lines_to_blocks("Genuine Failures:", lines, blocks) # --- Stabilized tests --- resolved_list = d.get("resolved_tests", []) @@ -119,13 +126,7 @@ if resolved_list: f" :white_check_mark: `{r['name']}` ({r['suite']}) \u2014 " f"failing since {since}, failed {count}x{flaky_tag}" ) - blocks.append({ - "type": "section", - "text": { - "type": "mrkdwn", - "text": "*Stabilized (were failing, now pass):*\n" + "\n".join(lines), - }, - }) + chunk_lines_to_blocks("Stabilized (were failing, now pass):", lines, blocks) # --- Flaky tests --- flaky_list = d.get("flaky_tests", []) @@ -134,10 +135,7 @@ if flaky_list: for f_entry in flaky_list: retries = f_entry.get("retry_count", "?") lines.append(f" :warning: `{f_entry['name']}` ({f_entry['suite']}) \u2014 {retries} retries") - blocks.append({ - "type": "section", - "text": {"type": "mrkdwn", "text": "*Flaky Tests (passed on retry):*\n" + "\n".join(lines)}, - }) + chunk_lines_to_blocks("Flaky Tests (passed on retry):", lines, blocks) # --- Links --- link_parts = [] From 4430650c5fc94bb243c752d0adb95f2f204e6878 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Fri, 17 Apr 2026 13:32:59 -0500 Subject: [PATCH 58/60] Keep error assertion in JUnit XML message extraction The traceback in JUnit XML failure elements starts with the test method signature and ends with the actual assertion/error. Taking the first 500 chars often cut off before the useful part. Now keeps the first line (context) plus the last 500 chars (where the assertion lives). 
--- ci/utils/nightly_report.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py index 2bd23b1f18..b836d82694 100755 --- a/ci/utils/nightly_report.py +++ b/ci/utils/nightly_report.py @@ -98,12 +98,28 @@ def parse_junit_xml(xml_path): status = "failed" message = failure.get("message", "") if failure.text: - message = failure.text[:500] + # Keep the last 500 chars (where the assertion/error is) + # plus the first line for context + text = failure.text.strip() + lines = text.splitlines() + first_line = lines[0] if lines else "" + last_chunk = text[-500:] if len(text) > 500 else text + if len(text) > 500: + message = first_line + "\n...\n" + last_chunk + else: + message = text elif error is not None: status = "error" message = error.get("message", "") if error.text: - message = error.text[:500] + text = error.text.strip() + lines = text.splitlines() + first_line = lines[0] if lines else "" + last_chunk = text[-500:] if len(text) > 500 else text + if len(text) > 500: + message = first_line + "\n...\n" + last_chunk + else: + message = text else: status = "passed" message = "" From f589b76a60715141609ff11be4a080d1717d83a6 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Fri, 17 Apr 2026 13:42:14 -0500 Subject: [PATCH 59/60] Add self-maintaining failed job links in Slack thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of mapping test_type to GitHub job names (fragile), add a separate thread reply listing all failed jobs with direct clickable links to their GitHub Actions logs. Self-maintaining — any new workflow automatically appears. 
--- ci/utils/send_consolidated_summary.sh | 33 +++++++++++++++++---------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh index a568d237ab..88f238899b 100755 --- a/ci/utils/send_consolidated_summary.sh +++ b/ci/utils/send_consolidated_summary.sh @@ -283,18 +283,6 @@ if issues_by_wf: if len(issues[category]) > limit: wf_text += f"_...+{len(issues[category]) - limit} more {label}_\n" - # Per-job log links: find workflow_jobs matching this workflow prefix - job_urls = [j["url"] for j in workflow_jobs - if j.get("url") and j["name"].split(" / ")[0] == wf_name - and j["conclusion"] == "failure"] - if not job_urls: - # Also try matching by test_type prefix for tracked jobs - job_urls = [j["url"] for j in workflow_jobs - if j.get("url") and j["name"].startswith(wf_name) - and j["conclusion"] == "failure"] - if job_urls: - wf_text += f"<{job_urls[0]}|:link: View Logs>\n" - # Chunk if needed while wf_text: chunk = wf_text[:2900] @@ -306,6 +294,27 @@ if issues_by_wf: print(make_payload(wf_blocks)) +# ── Thread: Failed job log links ────────────────────────────────────── +failed_job_links = [j for j in workflow_jobs if j["conclusion"] == "failure" and j.get("url")] +if failed_job_links: + link_blocks = [] + current = "*Failed Job Logs:*\n" + for j in failed_job_links: + line = f":x: <{j['url']}|{j['name']}>\n" + if len(current) + len(line) > 2900: + link_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current.rstrip()}, + }) + current = "" + current += line + if current.strip(): + link_blocks.append({ + "type": "section", + "text": {"type": "mrkdwn", "text": current.rstrip()}, + }) + print(make_payload(link_blocks)) + PYEOF ) From 2e49363daf87b45483ab60945c7f26d35bc193ed Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Fri, 17 Apr 2026 15:39:34 -0500 Subject: [PATCH 60/60] update --- ci/utils/nightly_report.py | 24 ++++-------------------- 1 file 
changed, 4 insertions(+), 20 deletions(-) diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py index b836d82694..55a39d89bf 100755 --- a/ci/utils/nightly_report.py +++ b/ci/utils/nightly_report.py @@ -98,28 +98,12 @@ def parse_junit_xml(xml_path): status = "failed" message = failure.get("message", "") if failure.text: - # Keep the last 500 chars (where the assertion/error is) - # plus the first line for context - text = failure.text.strip() - lines = text.splitlines() - first_line = lines[0] if lines else "" - last_chunk = text[-500:] if len(text) > 500 else text - if len(text) > 500: - message = first_line + "\n...\n" + last_chunk - else: - message = text + message = failure.text.strip() elif error is not None: status = "error" message = error.get("message", "") if error.text: - text = error.text.strip() - lines = text.splitlines() - first_line = lines[0] if lines else "" - last_chunk = text[-500:] if len(text) > 500 else text - if len(text) > 500: - message = first_line + "\n...\n" + last_chunk - else: - message = text + message = error.text.strip() else: status = "passed" message = "" @@ -560,7 +544,7 @@ def generate_json_summary( "suite": e["suite"], "name": e["name"], "classname": e["classname"], - "message": e.get("message", "")[:200], + "message": e.get("message", ""), } for e in new_failures ], @@ -570,7 +554,7 @@ def generate_json_summary( "name": e["name"], "classname": e["classname"], "first_seen": e.get("first_seen", "unknown"), - "message": e.get("message", "")[:200], + "message": e.get("message", ""), } for e in recurring_failures ],
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}