From a0043dd6a54ced1e574e29d422b504eaf51e05a5 Mon Sep 17 00:00:00 2001 From: Glen Maddern Date: Mon, 1 Dec 2025 15:39:00 +1100 Subject: [PATCH 1/3] feat: Add extract_eval_stats.py script for eval statistics extraction Extract detailed statistics from openbench/Inspect AI evaluation logs (.eval ZIP or .json files). - Total time and working time per sample - LLM HTTP round-trip time (from model call events) - Provider-specific timing for Groq (queue_time, prompt_time, completion_time) - Approximate tool time (working_time - llm_http_time) - LLM time fraction of total working time - Input, output, and total tokens per sample and aggregate - Per-model breakdown from model_usage - OpenAI: prompt_tokens and cached_tokens with cache hit rate - Anthropic: cache_read_input_tokens and cache_creation_input_tokens - Human-readable summary to stdout - JSON output with --output flag - Per-sample statistics with --per-sample flag ```bash python scripts/extract_eval_stats.py logs/my_eval.eval python scripts/extract_eval_stats.py logs/ -o stats.json --per-sample -v ``` ``` Model: groq/openai/gpt-oss-120b Task: progressivemcpbench-copilot Samples: 130 Score: mean=0.523 (min=0.000, max=1.000) CALLS: Model calls: 794 Tool calls: 590 TIME BREAKDOWN: Total eval time: 1h 46m LLM time: 7m 49s (7.3%) Tool time (approx): 1h 38m LLM CALL PERFORMANCE (avg per call): HTTP round-trip: 623ms Time to first token (queue): 12ms Prompt processing: 55ms Completion generation: 307ms TOKENS: Input: 5.41M Output: 124.4K Total: 5.54M THROUGHPUT: Input processing: 74.2K tok/s Output generation: 475 tok/s CACHING (OpenAI-compatible): Prompt tokens: 5.41M Cached tokens: 3.94M Cache hit rate: 72.8% ``` - Excludes scorer model calls (only counts primary model usage) - All data is extracted from existing eval logs - no runtime changes required - Cache statistics depend on provider support (OpenAI, Anthropic formats) Amp-Thread-ID: https://ampcode.com/threads/T-2f80b096-55b1-4353-930c-76e9f6368f71 Co-authored-by: Amp --- scripts/extract_eval_stats.py | 749 ++++++++++++++++++++++++++++++++++ 1 file changed, 749 insertions(+) create mode 100644 scripts/extract_eval_stats.py diff --git a/scripts/extract_eval_stats.py b/scripts/extract_eval_stats.py new file mode 100644 index 00000000..a9823173 --- /dev/null +++ b/scripts/extract_eval_stats.py @@ -0,0 +1,749 @@ +#!/usr/bin/env python3 +""" +Extract detailed statistics from openbench/Inspect AI evaluation logs. 
+ +This script parses .eval files (ZIP archives) or .json log files and extracts: +- Timing statistics: total time, LLM HTTP time, tool time +- Token usage: input, output, total tokens +- Cache statistics: cached tokens (OpenAI), cache read/creation (Anthropic) +- Provider-specific timing: queue_time, prompt_time, completion_time (Groq) + +Usage: + python scripts/extract_eval_stats.py [-o output.json] [--per-sample] [-v] + +Examples: + python scripts/extract_eval_stats.py logs/my_eval.eval + python scripts/extract_eval_stats.py logs/ -o stats.json --per-sample +""" + +import argparse +import json +import statistics +import sys +import zipfile +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + + +@dataclass +class SampleStats: + """Statistics for a single sample.""" + + sample_id: str + epoch: int | None = None + score: float | None = None + total_time: float | None = None + working_time: float | None = None + + # Model call counts + n_model_calls: int = 0 + n_tool_calls: int = 0 + + # LLM timing (from HTTP calls) + llm_http_time_total: float = 0.0 + llm_http_times: list[float] = field(default_factory=list) + + # Provider-specific timing (Groq) + llm_provider_time_total: float = 0.0 + llm_queue_time_total: float = 0.0 + llm_prompt_time_total: float = 0.0 + llm_completion_time_total: float = 0.0 + + # Token usage + input_tokens_total: int = 0 + output_tokens_total: int = 0 + total_tokens: int = 0 + + # OpenAI caching + openai_prompt_tokens_total: int = 0 + openai_cached_tokens_total: int = 0 + + # Anthropic caching + anthropic_input_tokens_total: int = 0 + anthropic_cache_read_tokens_total: int = 0 + anthropic_cache_creation_tokens_total: int = 0 + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return { + "sample_id": self.sample_id, + "epoch": self.epoch, + "score": self.score, + "total_time": self.total_time, + "working_time": self.working_time, + "n_model_calls": self.n_model_calls, + "n_tool_calls": self.n_tool_calls, + "llm_http_time_total": self.llm_http_time_total, + "llm_http_time_mean": ( + statistics.mean(self.llm_http_times) if self.llm_http_times else None + ), + "llm_provider_time_total": self.llm_provider_time_total, + "llm_queue_time_total": self.llm_queue_time_total, + "llm_prompt_time_total": self.llm_prompt_time_total, + "llm_completion_time_total": self.llm_completion_time_total, + "input_tokens_total": self.input_tokens_total, + "output_tokens_total": self.output_tokens_total, + "total_tokens": self.total_tokens, + "openai_prompt_tokens_total": self.openai_prompt_tokens_total, + "openai_cached_tokens_total": self.openai_cached_tokens_total, + "openai_cache_hit_rate": ( + self.openai_cached_tokens_total / self.openai_prompt_tokens_total + if self.openai_prompt_tokens_total > 0 + else None + ), + "anthropic_input_tokens_total": self.anthropic_input_tokens_total, + "anthropic_cache_read_tokens_total": self.anthropic_cache_read_tokens_total, + "anthropic_cache_creation_tokens_total": self.anthropic_cache_creation_tokens_total, + } + + +@dataclass +class EvalStats: + """Aggregated statistics for an entire evaluation.""" + + log_path: str + model: str | None = None + task: str | None = None + num_samples: int = 0 + sample_stats: list[SampleStats] = field(default_factory=list) + + def aggregate(self) -> dict[str, Any]: + """Compute aggregate statistics across all samples.""" + if not self.sample_stats: + return {"log_path": self.log_path, "num_samples": 0} + + # Filter for valid values + 
total_times = [ + s.total_time for s in self.sample_stats if s.total_time is not None + ] + working_times = [ + s.working_time for s in self.sample_stats if s.working_time is not None + ] + scores = [s.score for s in self.sample_stats if s.score is not None] + + # Counts + total_model_calls = sum(s.n_model_calls for s in self.sample_stats) + total_tool_calls = sum(s.n_tool_calls for s in self.sample_stats) + + # LLM timing - collect all individual call times for averaging + all_http_times = [] + all_queue_times = [] + all_prompt_times = [] + all_completion_times = [] + all_provider_times = [] + + for s in self.sample_stats: + all_http_times.extend(s.llm_http_times) + # Provider timing is summed per sample, so we need per-call averages + if s.n_model_calls > 0: + all_queue_times.append(s.llm_queue_time_total / s.n_model_calls) + all_prompt_times.append(s.llm_prompt_time_total / s.n_model_calls) + all_completion_times.append( + s.llm_completion_time_total / s.n_model_calls + ) + all_provider_times.append( + s.llm_provider_time_total / s.n_model_calls + ) + + # Token sums and rates + input_tokens_sum = sum(s.input_tokens_total for s in self.sample_stats) + output_tokens_sum = sum(s.output_tokens_total for s in self.sample_stats) + total_tokens_sum = sum(s.total_tokens for s in self.sample_stats) + + # Calculate throughput (tokens per second) + llm_completion_time_sum = sum( + s.llm_completion_time_total for s in self.sample_stats + ) + output_tokens_per_sec = ( + output_tokens_sum / llm_completion_time_sum + if llm_completion_time_sum > 0 + else None + ) + + llm_prompt_time_sum = sum(s.llm_prompt_time_total for s in self.sample_stats) + input_tokens_per_sec = ( + input_tokens_sum / llm_prompt_time_sum if llm_prompt_time_sum > 0 else None + ) + + # OpenAI cache sums + openai_prompt_sum = sum(s.openai_prompt_tokens_total for s in self.sample_stats) + openai_cached_sum = sum(s.openai_cached_tokens_total for s in self.sample_stats) + + # Anthropic cache sums + anthropic_input_sum = sum( + s.anthropic_input_tokens_total for s in self.sample_stats + ) + anthropic_cache_read_sum = sum( + s.anthropic_cache_read_tokens_total for s in self.sample_stats + ) + anthropic_cache_creation_sum = sum( + s.anthropic_cache_creation_tokens_total for s in self.sample_stats + ) + + # Time breakdown + llm_http_time_sum = sum(s.llm_http_time_total for s in self.sample_stats) + tool_time_approx = ( + sum(working_times) - llm_http_time_sum if working_times else None + ) + + return { + "log_path": self.log_path, + "model": self.model, + "task": self.task, + "num_samples": len(self.sample_stats), + # Scores + "score_mean": statistics.mean(scores) if scores else None, + "score_min": min(scores) if scores else None, + "score_max": max(scores) if scores else None, + # Sample timing + "total_time_sum": sum(total_times) if total_times else None, + "total_time_mean": statistics.mean(total_times) if total_times else None, + "total_time_p50": ( + statistics.median(total_times) if total_times else None + ), + "total_time_p95": ( + sorted(total_times)[int(len(total_times) * 0.95)] + if len(total_times) >= 20 + else None + ), + "working_time_sum": sum(working_times) if working_times else None, + "working_time_mean": ( + statistics.mean(working_times) if working_times else None + ), + # LLM call timing (averages per call) + "llm_http_time_mean": ( + statistics.mean(all_http_times) if all_http_times else None + ), + "llm_queue_time_mean": ( + statistics.mean(all_queue_times) if all_queue_times else None + ), + "llm_prompt_time_mean": 
( + statistics.mean(all_prompt_times) if all_prompt_times else None + ), + "llm_completion_time_mean": ( + statistics.mean(all_completion_times) if all_completion_times else None + ), + "llm_provider_time_mean": ( + statistics.mean(all_provider_times) if all_provider_times else None + ), + # Time breakdown (for understanding where time goes) + "llm_http_time_sum": llm_http_time_sum, + "tool_time_approx": tool_time_approx, + "llm_time_fraction": ( + llm_http_time_sum / sum(working_times) + if working_times and sum(working_times) > 0 + else None + ), + # Counts + "total_model_calls": total_model_calls, + "total_tool_calls": total_tool_calls, + # Tokens + "input_tokens_sum": input_tokens_sum, + "output_tokens_sum": output_tokens_sum, + "total_tokens_sum": total_tokens_sum, + # Throughput + "output_tokens_per_sec": output_tokens_per_sec, + "input_tokens_per_sec": input_tokens_per_sec, + # OpenAI caching + "openai_prompt_tokens_sum": openai_prompt_sum, + "openai_cached_tokens_sum": openai_cached_sum, + "openai_cache_hit_rate": ( + openai_cached_sum / openai_prompt_sum if openai_prompt_sum > 0 else None + ), + # Anthropic caching + "anthropic_input_tokens_sum": anthropic_input_sum, + "anthropic_cache_read_tokens_sum": anthropic_cache_read_sum, + "anthropic_cache_creation_tokens_sum": anthropic_cache_creation_sum, + } + + +def extract_usage_from_response(response: dict[str, Any]) -> dict[str, Any]: + """ + Extract token usage and caching info from a raw model response. + + Handles both OpenAI and Anthropic response formats. + """ + result: dict[str, Any] = { + "input_tokens": 0, + "output_tokens": 0, + "total_tokens": 0, + "openai_prompt_tokens": 0, + "openai_cached_tokens": 0, + "anthropic_input_tokens": 0, + "anthropic_cache_read_tokens": 0, + "anthropic_cache_creation_tokens": 0, + } + + usage = response.get("usage") or {} + + # OpenAI format + prompt_tokens = usage.get("prompt_tokens") or 0 + completion_tokens = usage.get("completion_tokens") or 0 + total_tokens = usage.get("total_tokens") or (prompt_tokens + completion_tokens) + + result["input_tokens"] = prompt_tokens + result["output_tokens"] = completion_tokens + result["total_tokens"] = total_tokens + result["openai_prompt_tokens"] = prompt_tokens + + # OpenAI cached tokens + prompt_details = usage.get("prompt_tokens_details") or {} + cached_tokens = prompt_details.get("cached_tokens") or 0 + result["openai_cached_tokens"] = cached_tokens + + # Anthropic format (check for anthropic-specific fields) + if "cache_read_input_tokens" in usage or "cache_creation_input_tokens" in usage: + result["anthropic_input_tokens"] = usage.get("input_tokens") or 0 + result["anthropic_cache_read_tokens"] = ( + usage.get("cache_read_input_tokens") or 0 + ) + result["anthropic_cache_creation_tokens"] = ( + usage.get("cache_creation_input_tokens") or 0 + ) + # For Anthropic, recalculate input tokens + result["input_tokens"] = usage.get("input_tokens") or 0 + result["output_tokens"] = usage.get("output_tokens") or 0 + result["total_tokens"] = result["input_tokens"] + result["output_tokens"] + + return result + + +def extract_provider_timing(metadata: dict[str, Any]) -> dict[str, float]: + """Extract provider-specific timing info (e.g., Groq timing).""" + return { + "queue_time": metadata.get("queue_time") or 0.0, + "prompt_time": metadata.get("prompt_time") or 0.0, + "completion_time": metadata.get("completion_time") or 0.0, + "total_time": metadata.get("total_time") or 0.0, + } + + +def build_span_hierarchy(events: list[dict[str, Any]]) -> dict[str, dict[str, 
Any]]: + """Build a mapping of span_id -> span info from span_begin events.""" + span_info: dict[str, dict[str, Any]] = {} + for evt in events: + if evt.get("event") == "span_begin": + span_info[evt["id"]] = { + "type": evt.get("type", ""), + "name": evt.get("name", ""), + "parent_id": evt.get("parent_id"), + } + return span_info + + +def get_root_span_type(span_id: str, span_info: dict[str, dict[str, Any]]) -> str: + """Walk up the span hierarchy to find the root span type (solvers, scorers, init).""" + visited: set[str] = set() + while span_id and span_id not in visited: + visited.add(span_id) + info = span_info.get(span_id) + if not info: + return "unknown" + if info["type"] in ("solvers", "scorers", "init"): + return info["type"] + span_id = info.get("parent_id", "") + return "unknown" + + +def extract_sample_stats(sample_data: dict[str, Any], verbose: bool = False) -> SampleStats: + """Extract statistics from a single sample. + + Only includes model calls from the 'solvers' span, excluding scorer LLM calls. + """ + stats = SampleStats( + sample_id=sample_data.get("id", "unknown"), + epoch=sample_data.get("epoch"), + total_time=sample_data.get("total_time"), + working_time=sample_data.get("working_time"), + ) + + # Extract score + scores = sample_data.get("scores") or {} + for scorer_name, scorer_data in scores.items(): + if isinstance(scorer_data, dict) and "value" in scorer_data: + stats.score = scorer_data["value"] + break + + # Process events + events = sample_data.get("events") or [] + + # Build span hierarchy to distinguish solver vs scorer calls + span_info = build_span_hierarchy(events) + + for event in events: + event_type = event.get("event") or event.get("type") or event.get("kind", "") + + if event_type == "model" or "model" in str(event_type).lower(): + # Check if this is a solver call (not a scorer call) + span_id = event.get("span_id", "") + root_span_type = get_root_span_type(span_id, span_info) + + # Only count model calls from solvers, not scorers + if root_span_type != "solvers": + if verbose and root_span_type == "scorers": + pass # Skip scorer calls silently unless debugging + continue + + stats.n_model_calls += 1 + + # HTTP time from call + call = event.get("call") or {} + http_time = call.get("time") or 0.0 + if http_time: + stats.llm_http_time_total += http_time + stats.llm_http_times.append(http_time) + + # Output contains usage and metadata + output = event.get("output") or {} + + # Token usage from output.usage + usage = output.get("usage") or {} + stats.input_tokens_total += usage.get("input_tokens") or 0 + stats.output_tokens_total += usage.get("output_tokens") or 0 + stats.total_tokens += usage.get("total_tokens") or 0 + + # Check for cache info in raw response + response = call.get("response") or {} + if response: + cache_info = extract_usage_from_response(response) + stats.openai_prompt_tokens_total += cache_info["openai_prompt_tokens"] + stats.openai_cached_tokens_total += cache_info["openai_cached_tokens"] + stats.anthropic_input_tokens_total += cache_info[ + "anthropic_input_tokens" + ] + stats.anthropic_cache_read_tokens_total += cache_info[ + "anthropic_cache_read_tokens" + ] + stats.anthropic_cache_creation_tokens_total += cache_info[ + "anthropic_cache_creation_tokens" + ] + + # Provider timing from output.metadata (Groq) + metadata = output.get("metadata") or {} + provider_timing = extract_provider_timing(metadata) + stats.llm_provider_time_total += provider_timing["total_time"] + stats.llm_queue_time_total += provider_timing["queue_time"] + 
stats.llm_prompt_time_total += provider_timing["prompt_time"] + stats.llm_completion_time_total += provider_timing["completion_time"] + + elif event_type == "tool" or "tool" in str(event_type).lower(): + stats.n_tool_calls += 1 + + # Also check model_usage at sample level (fallback) + model_usage = sample_data.get("model_usage") or {} + if not stats.input_tokens_total and model_usage: + for model_name, usage in model_usage.items(): + stats.input_tokens_total += usage.get("input_tokens") or 0 + stats.output_tokens_total += usage.get("output_tokens") or 0 + stats.total_tokens += usage.get("total_tokens") or 0 + + return stats + + +def parse_eval_file(eval_file: Path, verbose: bool = False) -> EvalStats: + """Parse a .eval ZIP file and extract statistics.""" + eval_stats = EvalStats(log_path=str(eval_file)) + + try: + with zipfile.ZipFile(eval_file, "r") as zf: + # Read start journal for metadata + try: + with zf.open("_journal/start.json") as f: + start_data = json.load(f) + eval_info = start_data.get("eval") or {} + eval_stats.model = eval_info.get("model") + eval_stats.task = eval_info.get("task") + except (KeyError, json.JSONDecodeError): + pass + + # Find all sample files + sample_files = [ + name + for name in zf.namelist() + if name.startswith("samples/") and name.endswith(".json") + ] + + for sample_file in sample_files: + try: + with zf.open(sample_file) as f: + sample_data = json.load(f) + sample_stats = extract_sample_stats(sample_data, verbose) + eval_stats.sample_stats.append(sample_stats) + except (json.JSONDecodeError, KeyError) as e: + if verbose: + print( + f"Warning: Could not parse {sample_file}: {e}", + file=sys.stderr, + ) + + eval_stats.num_samples = len(eval_stats.sample_stats) + + except zipfile.BadZipFile as e: + print(f"Warning: Could not read ZIP file {eval_file}: {e}", file=sys.stderr) + + return eval_stats + + +def parse_json_file(json_file: Path, verbose: bool = False) -> EvalStats: + """Parse a .json log file and extract statistics.""" + eval_stats = EvalStats(log_path=str(json_file)) + + try: + with open(json_file, "r", encoding="utf-8") as f: + data = json.load(f) + + # Try to find samples in various locations + samples = [] + if "results" in data and "samples" in data["results"]: + samples = data["results"]["samples"] + elif "samples" in data: + samples = data["samples"] + + for sample_data in samples: + sample_stats = extract_sample_stats(sample_data, verbose) + eval_stats.sample_stats.append(sample_stats) + + eval_stats.num_samples = len(eval_stats.sample_stats) + + # Try to get model/task info + if "eval" in data: + eval_stats.model = data["eval"].get("model") + eval_stats.task = data["eval"].get("task") + + except (json.JSONDecodeError, OSError) as e: + print(f"Warning: Could not parse {json_file}: {e}", file=sys.stderr) + + return eval_stats + + +def find_log_files(log_path: Path) -> list[Path]: + """Find all log files (.eval or .json) in a path.""" + if log_path.is_file(): + if log_path.suffix in (".eval", ".json"): + return [log_path] + return [] + + log_files = [] + for f in log_path.rglob("*.eval"): + log_files.append(f) + for f in log_path.rglob("*.json"): + if "samples" not in f.parts: # Skip sample subdirectory files + log_files.append(f) + + return sorted(log_files) + + +def format_time(seconds: float | None) -> str: + """Format seconds as human-readable time (e.g., '2h 20m', '45s', '1.23s').""" + if seconds is None: + return "N/A" + + if seconds < 1: + return f"{seconds*1000:.0f}ms" + elif seconds < 10: + return f"{seconds:.2f}s" + elif seconds 
< 60: + return f"{seconds:.1f}s" + elif seconds < 3600: + minutes = int(seconds // 60) + secs = int(seconds % 60) + if secs == 0: + return f"{minutes}m" + return f"{minutes}m {secs}s" + else: + hours = int(seconds // 3600) + remaining = seconds % 3600 + minutes = int(remaining // 60) + # Round to nearest 10 minutes for long durations + if hours >= 3: + minutes = round(minutes / 10) * 10 + if minutes == 60: + hours += 1 + minutes = 0 + if minutes == 0: + return f"{hours}h" + return f"{hours}h {minutes}m" + + +def format_tokens(count: int | None) -> str: + """Format token count as human-readable number.""" + if count is None or count == 0: + return "0" + if count < 1000: + return str(count) + elif count < 1_000_000: + return f"{count/1000:.1f}K" + else: + return f"{count/1_000_000:.2f}M" + + +def format_percentage(value: float | None) -> str: + """Format a fraction as percentage.""" + if value is None: + return "N/A" + return f"{value * 100:.1f}%" + + +def format_rate(tokens_per_sec: float | None) -> str: + """Format tokens per second as human-readable rate.""" + if tokens_per_sec is None: + return "N/A" + if tokens_per_sec >= 1000: + return f"{tokens_per_sec/1000:.1f}K tok/s" + return f"{tokens_per_sec:.0f} tok/s" + + +def print_summary(eval_stats: EvalStats) -> None: + """Print a human-readable summary of eval statistics.""" + agg = eval_stats.aggregate() + + print(f"\n{'=' * 60}") + print(f"EVAL: {agg['log_path']}") + print(f"{'=' * 60}") + + if agg["model"]: + print(f" Model: {agg['model']}") + if agg["task"]: + print(f" Task: {agg['task']}") + + print(f"\n Samples: {agg['num_samples']}") + if agg["score_mean"] is not None: + print( + f" Score: mean={agg['score_mean']:.3f} (min={agg['score_min']:.3f}, max={agg['score_max']:.3f})" + ) + + print(f"\n CALLS:") + print(f" Model calls: {agg['total_model_calls']}") + print(f" Tool calls: {agg['total_tool_calls']}") + + print(f"\n TIME BREAKDOWN:") + print(f" Total eval time: {format_time(agg['total_time_sum'])}") + print( + f" LLM time: {format_time(agg['llm_http_time_sum'])} ({format_percentage(agg['llm_time_fraction'])})" + ) + print(f" Tool time (approx): {format_time(agg['tool_time_approx'])}") + + # LLM call performance (averages) + if agg["llm_http_time_mean"] is not None: + print(f"\n LLM CALL PERFORMANCE (avg per call):") + print(f" HTTP round-trip: {format_time(agg['llm_http_time_mean'])}") + if agg["llm_queue_time_mean"] is not None and agg["llm_queue_time_mean"] > 0: + print(f" Time to first token (queue): {format_time(agg['llm_queue_time_mean'])}") + if agg["llm_prompt_time_mean"] is not None and agg["llm_prompt_time_mean"] > 0: + print(f" Prompt processing: {format_time(agg['llm_prompt_time_mean'])}") + if agg["llm_completion_time_mean"] is not None and agg["llm_completion_time_mean"] > 0: + print(f" Completion generation: {format_time(agg['llm_completion_time_mean'])}") + + print(f"\n TOKENS:") + print(f" Input: {format_tokens(agg['input_tokens_sum'])}") + print(f" Output: {format_tokens(agg['output_tokens_sum'])}") + print(f" Total: {format_tokens(agg['total_tokens_sum'])}") + + # Throughput + if agg["output_tokens_per_sec"] is not None or agg["input_tokens_per_sec"] is not None: + print(f"\n THROUGHPUT:") + if agg["input_tokens_per_sec"] is not None: + print(f" Input processing: {format_rate(agg['input_tokens_per_sec'])}") + if agg["output_tokens_per_sec"] is not None: + print(f" Output generation: {format_rate(agg['output_tokens_per_sec'])}") + + if agg["openai_cached_tokens_sum"] and agg["openai_cached_tokens_sum"] > 0: + 
print(f"\n CACHING (OpenAI-compatible):") + print(f" Prompt tokens: {format_tokens(agg['openai_prompt_tokens_sum'])}") + print(f" Cached tokens: {format_tokens(agg['openai_cached_tokens_sum'])}") + print(f" Cache hit rate: {format_percentage(agg['openai_cache_hit_rate'])}") + + if ( + agg["anthropic_cache_read_tokens_sum"] + and agg["anthropic_cache_read_tokens_sum"] > 0 + ) or ( + agg["anthropic_cache_creation_tokens_sum"] + and agg["anthropic_cache_creation_tokens_sum"] > 0 + ): + print(f"\n CACHING (Anthropic):") + print(f" Input tokens: {format_tokens(agg['anthropic_input_tokens_sum'])}") + print( + f" Cache read: {format_tokens(agg['anthropic_cache_read_tokens_sum'])}" + ) + print( + f" Cache creation: {format_tokens(agg['anthropic_cache_creation_tokens_sum'])}" + ) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Extract statistics from openbench evaluation logs" + ) + parser.add_argument( + "log_path", + type=Path, + help="Path to log directory or individual log file", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + help="Output file for JSON statistics", + ) + parser.add_argument( + "--per-sample", + action="store_true", + help="Include per-sample statistics in output", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Print verbose output", + ) + + args = parser.parse_args() + + if not args.log_path.exists(): + print(f"Error: Path does not exist: {args.log_path}", file=sys.stderr) + sys.exit(1) + + log_files = find_log_files(args.log_path) + if not log_files: + print(f"Error: No log files found in {args.log_path}", file=sys.stderr) + sys.exit(1) + + if args.verbose: + print(f"Found {len(log_files)} log file(s)") + + all_eval_stats: list[EvalStats] = [] + + for log_file in log_files: + if args.verbose: + print(f"Processing {log_file}") + + if log_file.suffix == ".eval": + eval_stats = parse_eval_file(log_file, args.verbose) + else: + eval_stats = parse_json_file(log_file, args.verbose) + + all_eval_stats.append(eval_stats) + print_summary(eval_stats) + + # Write JSON output if requested + if args.output: + output_data: dict[str, Any] = { + "evals": [es.aggregate() for es in all_eval_stats], + } + if args.per_sample: + output_data["samples"] = [] + for es in all_eval_stats: + for sample in es.sample_stats: + sample_dict = sample.to_dict() + sample_dict["eval_log_path"] = es.log_path + output_data["samples"].append(sample_dict) + + with open(args.output, "w", encoding="utf-8") as f: + json.dump(output_data, f, indent=2) + + print(f"\nJSON output written to: {args.output}") + + +if __name__ == "__main__": + main() From 60716487129ea05db6ed629f9674389a160f67fa Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 1 Dec 2025 22:56:29 +0000 Subject: [PATCH 2/3] Refactor LLM timing calculations for accuracy Co-authored-by: gmaddern --- scripts/extract_eval_stats.py | 39 ++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/scripts/extract_eval_stats.py b/scripts/extract_eval_stats.py index a9823173..1bcc3ea4 100644 --- a/scripts/extract_eval_stats.py +++ b/scripts/extract_eval_stats.py @@ -128,23 +128,20 @@ def aggregate(self) -> dict[str, Any]: # LLM timing - collect all individual call times for averaging all_http_times = [] - all_queue_times = [] - all_prompt_times = [] - all_completion_times = [] - all_provider_times = [] + total_queue_time_sum = 0.0 + total_prompt_time_sum = 0.0 + total_completion_time_sum = 0.0 + total_provider_time_sum = 0.0 + 
total_provider_time_calls = 0 for s in self.sample_stats: all_http_times.extend(s.llm_http_times) - # Provider timing is summed per sample, so we need per-call averages if s.n_model_calls > 0: - all_queue_times.append(s.llm_queue_time_total / s.n_model_calls) - all_prompt_times.append(s.llm_prompt_time_total / s.n_model_calls) - all_completion_times.append( - s.llm_completion_time_total / s.n_model_calls - ) - all_provider_times.append( - s.llm_provider_time_total / s.n_model_calls - ) + total_queue_time_sum += s.llm_queue_time_total + total_prompt_time_sum += s.llm_prompt_time_total + total_completion_time_sum += s.llm_completion_time_total + total_provider_time_sum += s.llm_provider_time_total + total_provider_time_calls += s.n_model_calls # Token sums and rates input_tokens_sum = sum(s.input_tokens_total for s in self.sample_stats) @@ -216,16 +213,24 @@ def aggregate(self) -> dict[str, Any]: statistics.mean(all_http_times) if all_http_times else None ), "llm_queue_time_mean": ( - statistics.mean(all_queue_times) if all_queue_times else None + total_queue_time_sum / total_provider_time_calls + if total_provider_time_calls > 0 + else None ), "llm_prompt_time_mean": ( - statistics.mean(all_prompt_times) if all_prompt_times else None + total_prompt_time_sum / total_provider_time_calls + if total_provider_time_calls > 0 + else None ), "llm_completion_time_mean": ( - statistics.mean(all_completion_times) if all_completion_times else None + total_completion_time_sum / total_provider_time_calls + if total_provider_time_calls > 0 + else None ), "llm_provider_time_mean": ( - statistics.mean(all_provider_times) if all_provider_times else None + total_provider_time_sum / total_provider_time_calls + if total_provider_time_calls > 0 + else None ), # Time breakdown (for understanding where time goes) "llm_http_time_sum": llm_http_time_sum, From ce075e0f1fe069b08c9ad12c9d070ded4679bc56 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 1 Dec 2025 22:58:53 +0000 Subject: [PATCH 3/3] Refactor total_time_p95 calculation for accuracy Co-authored-by: gmaddern --- scripts/extract_eval_stats.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/extract_eval_stats.py b/scripts/extract_eval_stats.py index 1bcc3ea4..2f6a5195 100644 --- a/scripts/extract_eval_stats.py +++ b/scripts/extract_eval_stats.py @@ -184,6 +184,18 @@ def aggregate(self) -> dict[str, Any]: sum(working_times) - llm_http_time_sum if working_times else None ) + total_time_p95 = None + if len(total_times) >= 20: + sorted_times = sorted(total_times) + percentile_position = 0.95 * (len(sorted_times) - 1) + lower_index = int(percentile_position) + upper_index = min(lower_index + 1, len(sorted_times) - 1) + fraction = percentile_position - lower_index + total_time_p95 = ( + sorted_times[lower_index] * (1 - fraction) + + sorted_times[upper_index] * fraction + ) + return { "log_path": self.log_path, "model": self.model, @@ -199,11 +211,7 @@ def aggregate(self) -> dict[str, Any]: "total_time_p50": ( statistics.median(total_times) if total_times else None ), - "total_time_p95": ( - sorted(total_times)[int(len(total_times) * 0.95)] - if len(total_times) >= 20 - else None - ), + "total_time_p95": total_time_p95, "working_time_sum": sum(working_times) if working_times else None, "working_time_mean": ( statistics.mean(working_times) if working_times else None
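
A quick note on the two aggregation changes above: PATCH 2 switches the per-call provider timing averages from a mean of per-sample means to a call-weighted mean (summed time divided by summed call count), so samples with many model calls are no longer underweighted; PATCH 3 switches `total_time_p95` from direct indexing to linear interpolation between ranks. The sketch below works the PATCH 3 formula through by hand with 20 hypothetical sample times (not taken from any real log). Under the old indexing, `sorted(total_times)[int(len(total_times) * 0.95)]` would pick index 19 and report 92.0s, while the interpolated value lands between the two highest samples.

```python
# Standalone sketch of the linear-interpolation p95 from PATCH 3.
# The timing values are hypothetical, chosen only to make the arithmetic easy to follow.
total_times = [12.0, 15.5, 18.2, 21.0, 25.4, 30.1, 33.3, 36.8, 40.2, 44.7,
               48.0, 52.5, 57.1, 61.9, 66.4, 70.0, 75.2, 80.8, 86.3, 92.0]

sorted_times = sorted(total_times)
percentile_position = 0.95 * (len(sorted_times) - 1)      # 0.95 * 19 = 18.05
lower_index = int(percentile_position)                     # 18
upper_index = min(lower_index + 1, len(sorted_times) - 1)  # 19
fraction = percentile_position - lower_index               # 0.05
total_time_p95 = (
    sorted_times[lower_index] * (1 - fraction)             # 86.3 * 0.95
    + sorted_times[upper_index] * fraction                 # 92.0 * 0.05
)

print(f"p95 = {total_time_p95:.3f}s")  # p95 = 86.585s
```

This matches the common "linear" percentile definition (the default interpolation used by `numpy.percentile`), which is less sensitive to the single largest sample than the previous direct index.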