Skip to content

feat(benchmark): RunTrendAnalyzer — detect score regression across sequential benchmark runs #101

@nanookclaw

Description

@nanookclaw

Summary

benchmark.py writes per-run results to {output_dir}/{run_id}_{model_slug}.json, each including:

  • timestamp: Unix epoch for the run
  • efficiency.overall_score / per-task grading.mean: score for each run

There is no mechanism to detect whether a model's benchmark score is improving or degrading across sequential runs. Each run is isolated — if a model regresses from 87% → 83% → 79% across three runs, that slope is invisible.

Proposed: RunTrendAnalyzer

A lightweight analysis utility that reads all *.json result files from the output directory (filtering on each file's model field), orders them by timestamp, computes an OLS slope on score_pct, and flags regression.

# scripts/lib_trend.py

from __future__ import annotations

import json
import statistics
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional


@dataclass
class RunPoint:
    """A single benchmark run's score, as one observation in a model's time series."""

    # Identifier of the run (falls back to the result file's stem when absent).
    run_id: str
    # Unix epoch seconds of the run; points are sorted by this value.
    timestamp: float
    # Model slug this run belongs to.
    model: str
    # Overall score for the run, expressed as a percentage (0-100 scale).
    score_pct: float
    # Number of tasks recorded in the run's result file.
    task_count: int


@dataclass
class RunTrendReport:
    """Trend-analysis result for one model over its most recent runs."""

    # Model slug the report describes.
    model: str
    # Number of runs actually included (may be fewer than the window).
    run_count: int
    # Configured maximum number of trailing runs considered.
    window: int
    slope: float  # score_pct per run (negative = regression)
    # The chronologically ordered points the slope was fit to.
    points: List[RunPoint]
    # True when slope fell below regression_threshold.
    regression_detected: bool
    # Slope cutoff that triggers the regression flag.
    regression_threshold: float

    def summary(self) -> str:
        """Return a one-line, human-readable description of the trend."""
        if self.regression_detected:
            direction = "▼ REGRESSION"
        elif self.slope > 0:
            direction = "▲ improving"
        else:
            direction = "→ stable"
        return (
            f"{direction}: {self.model} slope={self.slope:+.2f}%/run "
            f"over last {self.run_count} runs "
            f"(threshold={self.regression_threshold:+.2f})"
        )


class RunTrendAnalyzer:
    """Detect score regression across sequential benchmark runs for a given model.

    Reads ``*.json`` result files from ``results_dir``, groups them by the
    ``model`` field, orders each group by ``timestamp``, and fits an OLS line
    to ``score_pct`` over the trailing ``window`` runs. A slope below
    ``regression_threshold`` flags a regression.
    """

    def __init__(
        self,
        results_dir: Path,
        *,
        window: int = 10,
        regression_threshold: float = -0.5,  # slope below this triggers regression flag
    ):
        self.results_dir = results_dir
        self.window = window
        self.regression_threshold = regression_threshold

    def load_points(self, model: Optional[str] = None) -> dict[str, List[RunPoint]]:
        """Load all run points, grouped by model slug.

        Args:
            model: If given, only load runs whose ``model`` field matches.

        Returns:
            Mapping of model slug -> points sorted ascending by timestamp.
            Files that are unreadable, malformed, or contain no graded tasks
            are skipped (best-effort ingestion of a mixed results directory).
        """
        grouped: dict[str, List[RunPoint]] = {}
        for path in self.results_dir.glob("*.json"):
            try:
                data = json.loads(path.read_text())
            except (OSError, json.JSONDecodeError):
                # Narrowed from a bare Exception: only I/O and parse failures
                # are expected here; anything else should surface.
                continue
            m = data.get("model", "")
            if model and m != model:
                # Filter before scoring so non-matching files cost nothing.
                continue
            ts = data.get("timestamp", 0.0)
            run_id = data.get("run_id", path.stem)
            tasks = data.get("tasks", [])
            # BUG FIX: average over graded tasks only. The original summed the
            # means of tasks that had a "grading" entry but divided by
            # len(tasks), deflating score_pct whenever any task lacked grading.
            graded = [t["grading"]["mean"] for t in tasks if "grading" in t]
            if not graded:
                continue
            score_pct = (sum(graded) / len(graded)) * 100
            grouped.setdefault(m, []).append(RunPoint(run_id, ts, m, score_pct, len(tasks)))
        for pts in grouped.values():
            pts.sort(key=lambda p: p.timestamp)
        return grouped

    def analyze(self, model: Optional[str] = None) -> List[RunTrendReport]:
        """Return trend reports for each model (or the specified model).

        Models with fewer than two runs in the window are omitted — a slope
        is undefined for a single point.
        """
        grouped = self.load_points(model)
        reports: List[RunTrendReport] = []
        for m, pts in grouped.items():
            window_pts = pts[-self.window :]
            if len(window_pts) < 2:
                continue
            # Regress score against run index (equal spacing); slope is in
            # score_pct per run, matching regression_threshold's units.
            xs = list(range(len(window_pts)))
            ys = [p.score_pct for p in window_pts]
            slope, _ = statistics.linear_regression(xs, ys)
            reports.append(
                RunTrendReport(
                    model=m,
                    run_count=len(window_pts),
                    window=self.window,
                    slope=slope,
                    points=window_pts,
                    regression_detected=slope < self.regression_threshold,
                    regression_threshold=self.regression_threshold,
                )
            )
        return reports

CLI Integration

# Add to benchmark.py or as a standalone script
python scripts/benchmark.py trend --output-dir results/ --model claude-sonnet-4-6
# Output: ▼ REGRESSION: claude-sonnet-4-6 slope=-1.80%/run over last 8 runs (threshold=-0.50)

Why This Matters

PinchBench runs are already timestamped and written to output_dir/ — the raw data is there. The leaderboard captures cross-model ranking, but cross-run trend for the same model is invisible. A model degrading at −1%/run over 10 runs loses 10 percentage points before anyone notices.

This is complementary to the leaderboard: leaderboard = cross-model snapshot, trend = within-model trajectory.

Implementation notes:

  • Pure stdlib: uses statistics.linear_regression (Python 3.10+, already in pyproject.toml)
  • Zero new dependencies
  • Strictly additive — does not touch existing result writing or upload paths
  • Compatible with existing {run_id}_{model_slug}.json format

Reference: PDR: Persistence of Behavioral Drift in Production LLM Agents — 112 confirmed instances of this gap across evaluation frameworks.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions