Skip to content

feat(evaluation): RunTrendAnalyzer — detect total_score regression across sequential benchmark runs #3

@nanookclaw

Description

@nanookclaw

Summary

evaluation/score.py writes a _score.json to each run's workspace with:

  • run_id
  • task_id
  • total_score (0–100 weighted average across checklist items)
  • timestamp (from _meta.json in the same workspace)

evaluation/utils.py's list_runs() already returns all runs sorted by timestamp (reverse order), including the workspace path. The raw data for cross-run trend analysis is fully present.

Gap: If an agent's total_score regresses from 72 → 68 → 64 across three runs, that slope is invisible. Each run is evaluated in isolation.

Proposed: RunTrendAnalyzer

# evaluation/trend.py

from __future__ import annotations

import json
import statistics
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from .utils import list_runs, get_run_workspace


@dataclass
class RunScorePoint:
    """A single run's score sample, as read from that run's workspace _score.json."""

    run_id: str  # unique identifier of the benchmark run
    task_id: Optional[str]  # task the run executed; None when not recorded
    timestamp: str  # timestamp sourced from _meta.json; "" when missing
    total_score: float  # 0-100 weighted average across checklist items
    status: str  # run status; only "completed" runs are loaded into points


@dataclass
class RunTrendReport:
    """Outcome of a linear trend fit over the most recent window of runs."""

    task_id: Optional[str]  # task the fit was restricted to; None = all tasks
    run_count: int  # number of points actually used in the fit
    window: int  # configured maximum window size
    slope: float          # total_score per run (negative = regression)
    points: List[RunScorePoint]  # the score points the slope was fitted on
    regression_detected: bool  # True when slope fell below the threshold
    regression_threshold: float  # slope cutoff that triggers the flag

    def summary(self) -> str:
        """Render the trend as a single human-readable status line."""
        if self.regression_detected:
            direction = "▼ REGRESSION"
        elif self.slope > 0:
            direction = "▲ improving"
        else:
            direction = "→ stable"
        scope = self.task_id or "all tasks"
        head = f"{direction} [{scope}]: slope={self.slope:+.2f} pts/run "
        tail = f"over last {self.run_count} runs (threshold={self.regression_threshold:+.2f})"
        return head + tail


class RunTrendAnalyzer:
    """Detect total_score regression across sequential benchmark runs.

    Fits an ordinary least-squares line to the last ``window`` completed
    runs' total_score values and flags a regression when the fitted slope
    drops below ``regression_threshold``.
    """

    def __init__(
        self,
        *,
        window: int = 10,
        regression_threshold: float = -1.0,  # slope below this triggers regression flag
    ):
        if window < 1:
            # A non-positive window would silently fit over every run
            # (``points[-0:]`` is the whole list) — fail loudly instead.
            raise ValueError(f"window must be >= 1, got {window}")
        self.window = window
        self.regression_threshold = regression_threshold

    def load_points(self, task_id: Optional[str] = None) -> List[RunScorePoint]:
        """Load score points from all completed runs, ordered by timestamp ascending.

        Runs are skipped (never raised on) when they fail the task filter,
        are not completed, lack a workspace, lack a readable _score.json,
        or carry a non-numeric total_score.
        """
        runs = list_runs()  # returns list of dicts with run_id, task_id, timestamp, status, workspace
        points: List[RunScorePoint] = []
        for run in reversed(runs):  # list_runs returns reverse-chrono; reverse to get ascending
            if task_id and run.get("task_id") != task_id:
                continue
            if run.get("status") != "completed":
                continue
            ws = get_run_workspace(run["run_id"])
            if ws is None:
                continue
            score_path = ws / "_score.json"
            if not score_path.exists():
                continue
            try:
                score_data = json.loads(score_path.read_text())
                # Converted inside the try: a non-numeric "total_score"
                # (e.g. null or a stray string) must skip the run, not
                # abort the whole analysis with an uncaught TypeError.
                total_score = float(score_data.get("total_score", 0))
            except (OSError, ValueError, TypeError):
                # Unreadable file, malformed JSON (JSONDecodeError is a
                # ValueError), or a bad score value — no usable sample.
                continue
            points.append(RunScorePoint(
                run_id=run["run_id"],
                task_id=run.get("task_id"),
                timestamp=run.get("timestamp", ""),
                total_score=total_score,
                status=run["status"],
            ))
        return points

    def analyze(self, task_id: Optional[str] = None) -> Optional[RunTrendReport]:
        """Return trend report for the specified task (or all completed runs if None).

        Returns None when fewer than two usable points fall inside the
        window, since a slope cannot be fitted from a single sample.
        """
        points = self.load_points(task_id)
        window_pts = points[-self.window:]
        if len(window_pts) < 2:
            return None
        xs = list(range(len(window_pts)))
        ys = [p.total_score for p in window_pts]
        # Ordinary least squares over (run index, score); pure stdlib, 3.10+.
        slope, _ = statistics.linear_regression(xs, ys)
        return RunTrendReport(
            task_id=task_id,
            run_count=len(window_pts),
            window=self.window,
            slope=slope,
            points=window_pts,
            regression_detected=slope < self.regression_threshold,
            regression_threshold=self.regression_threshold,
        )

CLI Integration

Add a trend subcommand to evaluation/__main__.py:

elif args.command == "trend":
    from .trend import RunTrendAnalyzer
    analyzer = RunTrendAnalyzer(window=args.window or 10)
    report = analyzer.analyze(task_id=getattr(args, "task", None))
    if report:
        print(report.summary())
    else:
        print("Not enough completed runs for trend analysis (need ≥ 2).")
python -m evaluation trend --task openai-tripos-b
# ▼ REGRESSION [openai-tripos-b]: slope=-2.40 pts/run over last 8 runs (threshold=-1.00)

Why This Matters

  • Agents are iterated (prompt changes, model upgrades, tool updates) — cross-run score trajectory is the primary signal for whether changes are helping or hurting
  • ResearchClawBench's rubric is complex (objective + subjective modes, weighted checklist) — gradual drift is easy to miss run-to-run
  • list_runs() + _score.json provide everything needed; this is strictly additive (no schema changes)
  • Pure stdlib: statistics.linear_regression (Python 3.10+)

Reference: PDR: Persistence of Behavioral Drift in Production LLM Agents — 113 confirmed instances of this gap across evaluation frameworks.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions