From 9e732c11e57800564bc8f69d137870d19f9ddb79 Mon Sep 17 00:00:00 2001 From: Rob von Behren Date: Tue, 10 Mar 2026 14:47:24 -0700 Subject: [PATCH] fix: bmj list now counts finished scenarios from in-progress runs Previously only counted scenarios from completed benchmark runs (benchmark_outcomes), missing finished scenarios in still-running benchmark runs. Now fetches scenario run details for in-progress runs to get accurate done/error/score counts. Co-Authored-By: Claude Opus 4.6 --- src/commands/benchmark-job/list.ts | 51 +++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/src/commands/benchmark-job/list.ts b/src/commands/benchmark-job/list.ts index 0beba1d..86d1dec 100644 --- a/src/commands/benchmark-job/list.ts +++ b/src/commands/benchmark-job/list.ts @@ -5,6 +5,7 @@ import chalk from "chalk"; import { listBenchmarkJobs, + listBenchmarkRunScenarioRuns, type BenchmarkJob, } from "../../services/benchmarkJobService.js"; import { output, outputError } from "../../utils/output.js"; @@ -54,7 +55,16 @@ interface JobStats { avgScore: number | null; } -function aggregateJobStats(job: BenchmarkJob): JobStats { +// Scenario run states that count as finished +const SCENARIO_DONE_STATES = new Set([ + "completed", + "failed", + "canceled", + "timeout", + "error", +]); + +async function aggregateJobStats(job: BenchmarkJob): Promise<JobStats> { const outcomes = job.benchmark_outcomes || []; const scenarioCount = job.job_spec?.scenario_ids?.length || 0; const agentCount = job.job_spec?.agent_configs?.length || 1; @@ -65,6 +75,7 @@ function aggregateJobStats(job: BenchmarkJob): JobStats { let scoreSum = 0; let scoreCount = 0; + // Count from completed benchmark runs for (const outcome of outcomes) { done += outcome.n_completed + outcome.n_failed + outcome.n_timeout; errors += outcome.n_failed + outcome.n_timeout; @@ -74,6 +85,38 @@ function aggregateJobStats(job: BenchmarkJob): JobStats { } } + // Count finished scenarios from in-progress 
benchmark runs + const inProgressRuns = job.in_progress_runs || []; + if (inProgressRuns.length > 0) { + const runResults = await Promise.all( + inProgressRuns.map((run) => + listBenchmarkRunScenarioRuns(run.benchmark_run_id), + ), + ); + for (const scenarioRuns of runResults) { + let runScoreSum = 0; + let runScoreCount = 0; + for (const sr of scenarioRuns) { + const state = sr.state?.toLowerCase() || ""; + if (SCENARIO_DONE_STATES.has(state)) { + done++; + if (state !== "completed") { + errors++; + } + const score = sr.scoring_contract_result?.score; + if (score !== undefined && score !== null) { + runScoreSum += score; + runScoreCount++; + } + } + } + if (runScoreCount > 0) { + scoreSum += runScoreSum / runScoreCount; + scoreCount++; + } + } + } + return { done, total: total || done, @@ -120,7 +163,7 @@ function truncate(str: string, maxLen: number): string { return str.slice(0, maxLen - 1) + "…"; } -function printTable(jobs: BenchmarkJob[]): void { +async function printTable(jobs: BenchmarkJob[]): Promise<void> { if (jobs.length === 0) { console.log(chalk.dim("No benchmark jobs found")); return; @@ -149,7 +192,7 @@ function printTable(jobs: BenchmarkJob[]): void { // Rows for (const job of jobs) { - const stats = aggregateJobStats(job); + const stats = await aggregateJobStats(job); const id = truncate(job.id, COL_ID).padEnd(COL_ID); const name = truncate(job.name || "", nameWidth).padEnd(nameWidth); @@ -262,7 +305,7 @@ export async function listBenchmarkJobsCommand( if (format !== "text") { output(jobs, { format, defaultFormat: "json" }); } else { - printTable(jobs); + await printTable(jobs); } } catch (error) { outputError("Failed to list benchmark jobs", error);