pinchbench · olearycrew · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
@@ -251,17 +251,28 @@ def _parse_args() -> argparse.Namespace:
         action="store_true",
         help="Continue running all tasks even if sanity check scores 0%%",
     )
+    parser.add_argument(
+        "--trend",
+        action="store_true",
+        help="Run trend analysis after benchmark completes (requires ≥2 runs in output dir)",
+    )
     parser.add_argument(
         "--trend-window",
         type=int,
-        default=None,
+        default=10,
         metavar="N",
-        help="Analyze score trends over the last N runs after benchmarking (requires ≥2 runs)",
+        help="Number of recent runs to include in trend analysis (default: 10)",
+    )
+    parser.add_argument(
+        "--trend-threshold",
+        type=float,
+        default=-0.5,
+        help="Slope (%%/run) below which regression is flagged (default: -0.5)",
     )
     args = parser.parse_args()
 
     # Validate --trend-window
-    if args.trend_window is not None and args.trend_window < 2:
+    if args.trend_window < 2:
         parser.error("--trend-window must be >= 2")
 
     return args
@@ -846,13 +857,14 @@ def _build_and_write_results():
     _log_category_summary(task_entries, tasks_by_id)
     _log_efficiency_summary(efficiency, grades_by_task_id)
     # Run trend analysis if requested
-    if args.trend_window is not None:
+    if args.trend:
         try:
             from lib_trend import RunTrendAnalyzer
 
             analyzer = RunTrendAnalyzer(
                 results_dir=output_dir,
                 window=args.trend_window,
+                regression_threshold=args.trend_threshold,
             )
             analyzer.run(model=args.model)
         except Exception as exc:

diff --git a/scripts/lib_trend.py b/scripts/lib_trend.py
@@ -88,6 +88,10 @@ def load_points(self, model: Optional[str] = None) -> Dict[str, List[RunPoint]]:
             except (json.JSONDecodeError, OSError):
                 continue
 
+            # Skip runs flagged in_progress (results are written incrementally mid-benchmark)
+            if data.get("in_progress"):
+                continue
+
             m = data.get("model", "")
             ts = data.get("timestamp", 0.0)
             run_id = data.get("run_id", path.stem)