diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 4f6e32a..9da6a7d 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -251,17 +251,28 @@ def _parse_args() -> argparse.Namespace: action="store_true", help="Continue running all tasks even if sanity check scores 0%%", ) + parser.add_argument( + "--trend", + action="store_true", + help="Run trend analysis after benchmark completes (requires ≥2 runs in output dir)", + ) parser.add_argument( "--trend-window", type=int, - default=None, + default=10, metavar="N", - help="Analyze score trends over the last N runs after benchmarking (requires ≥2 runs)", + help="Number of recent runs to include in trend analysis (default: 10)", + ) + parser.add_argument( + "--trend-threshold", + type=float, + default=-0.5, + help="Slope (%%/run) below which regression is flagged (default: -0.5)", ) args = parser.parse_args() # Validate --trend-window - if args.trend_window is not None and args.trend_window < 2: + if args.trend_window < 2: parser.error("--trend-window must be >= 2") return args @@ -846,13 +857,14 @@ def _build_and_write_results(): _log_category_summary(task_entries, tasks_by_id) _log_efficiency_summary(efficiency, grades_by_task_id) # Run trend analysis if requested - if args.trend_window is not None: + if args.trend: try: from lib_trend import RunTrendAnalyzer analyzer = RunTrendAnalyzer( results_dir=output_dir, window=args.trend_window, + regression_threshold=args.trend_threshold, ) analyzer.run(model=args.model) except Exception as exc: diff --git a/scripts/lib_trend.py b/scripts/lib_trend.py index 708be4f..0677e24 100644 --- a/scripts/lib_trend.py +++ b/scripts/lib_trend.py @@ -88,6 +88,10 @@ def load_points(self, model: Optional[str] = None) -> Dict[str, List[RunPoint]]: except (json.JSONDecodeError, OSError): continue + # Skip incomplete runs (written incrementally during benchmark) + if data.get("in_progress"): + continue + m = data.get("model", "") ts = data.get("timestamp", 0.0) run_id = data.get("run_id", path.stem)