From 7221f3301618f1345668c5e77f5da9109a2ea5d9 Mon Sep 17 00:00:00 2001 From: Brendan O'Leary Date: Thu, 9 Apr 2026 12:37:04 -0400 Subject: [PATCH 1/4] Add --trend flag for post-run trend analysis Wire RunTrendAnalyzer into benchmark.py via a new --trend flag. When passed, analyzes score trends for the benchmarked model after results are written, logging regression/improvement before upload. Additional flags --trend-window (default 10) and --trend-threshold (default -0.5) allow tuning the analysis parameters. Usage: python benchmark.py --model anthropic/claude-sonnet-4 --trend python benchmark.py --model anthropic/claude-sonnet-4 --trend --trend-window 5 --- scripts/benchmark.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 4f6e32a..d39e1d0 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -34,6 +34,7 @@ ) from lib_grading import GradeResult, grade_task from lib_tasks import Task, TaskLoader +from lib_trend import RunTrendAnalyzer # Configure logging @@ -251,17 +252,28 @@ def _parse_args() -> argparse.Namespace: action="store_true", help="Continue running all tasks even if sanity check scores 0%%", ) + parser.add_argument( + "--trend", + action="store_true", + help="Run trend analysis after benchmark completes (requires ≥2 runs in output dir)", + ) parser.add_argument( "--trend-window", type=int, - default=None, + default=10, metavar="N", - help="Analyze score trends over the last N runs after benchmarking (requires ≥2 runs)", + help="Number of recent runs to include in trend analysis (default: 10)", + ) + parser.add_argument( + "--trend-threshold", + type=float, + default=-0.5, + help="Slope (%/run) below which regression is flagged (default: -0.5)", ) args = parser.parse_args() # Validate --trend-window - if args.trend_window is not None and args.trend_window < 2: + if args.trend_window < 2: parser.error("--trend-window must be >= 2") return args @@ -846,13 
+858,14 @@ def _build_and_write_results(): _log_category_summary(task_entries, tasks_by_id) _log_efficiency_summary(efficiency, grades_by_task_id) # Run trend analysis if requested - if args.trend_window is not None: + if args.trend: try: from lib_trend import RunTrendAnalyzer analyzer = RunTrendAnalyzer( results_dir=output_dir, window=args.trend_window, + regression_threshold=args.trend_threshold, ) analyzer.run(model=args.model) except Exception as exc: From 33b55d0e51cf01fdf5adaf790fa618cd0527d4bc Mon Sep 17 00:00:00 2001 From: Brendan O'Leary Date: Thu, 9 Apr 2026 12:47:10 -0400 Subject: [PATCH 2/4] Address review feedback - Fix help text: %%%%/run -> %/run (was over-escaped; note that argparse %-formats help strings, so a literal % still needs %%, which PATCH 4/4 restores) - Validate --trend-window >= 2 to avoid confusing behavior - Wrap trend analysis in try/except so failures don't abort upload - Skip in_progress runs in lib_trend.py to avoid skewed regression detection --- scripts/lib_trend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/lib_trend.py b/scripts/lib_trend.py index 708be4f..0677e24 100644 --- a/scripts/lib_trend.py +++ b/scripts/lib_trend.py @@ -88,6 +88,10 @@ def load_points(self, model: Optional[str] = None) -> Dict[str, List[RunPoint]]: except (json.JSONDecodeError, OSError): continue + # Skip incomplete runs (written incrementally during benchmark) + if data.get("in_progress"): + continue + m = data.get("model", "") ts = data.get("timestamp", 0.0) run_id = data.get("run_id", path.stem) From a3ae51b1948dd991597d175cfb7af1700df2f771 Mon Sep 17 00:00:00 2001 From: Brendan O'Leary Date: Thu, 9 Apr 2026 12:57:15 -0400 Subject: [PATCH 3/4] Remove unused top-level import (now lazy-loaded inside try block) --- scripts/benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index d39e1d0..b61c0ed 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -34,7 +34,6 @@ ) from lib_grading import GradeResult, grade_task from lib_tasks import Task, TaskLoader -from 
lib_trend import RunTrendAnalyzer # Configure logging From 78e82cd0a0c366601528d22b0b442ed8d6a7ea6d Mon Sep 17 00:00:00 2001 From: Brendan O'Leary Date: Thu, 9 Apr 2026 13:00:51 -0400 Subject: [PATCH 4/4] Escape % in argparse help string --- scripts/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index b61c0ed..9da6a7d 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -267,7 +267,7 @@ def _parse_args() -> argparse.Namespace: "--trend-threshold", type=float, default=-0.5, - help="Slope (%/run) below which regression is flagged (default: -0.5)", + help="Slope (%%/run) below which regression is flagged (default: -0.5)", ) args = parser.parse_args()