Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 47 additions & 4 deletions src/commands/benchmark-job/list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import chalk from "chalk";
import {
listBenchmarkJobs,
listBenchmarkRunScenarioRuns,
type BenchmarkJob,
} from "../../services/benchmarkJobService.js";
import { output, outputError } from "../../utils/output.js";
Expand Down Expand Up @@ -54,7 +55,16 @@ interface JobStats {
avgScore: number | null;
}

function aggregateJobStats(job: BenchmarkJob): JobStats {
// Scenario-run states treated as finished. Lookups are made with the
// lowercased state string, so every entry here is lowercase. Only
// "completed" is a non-error outcome for the code that consumes this set.
const SCENARIO_DONE_STATES = new Set<string>(
  ["completed", "failed", "canceled", "timeout", "error"],
);

async function aggregateJobStats(job: BenchmarkJob): Promise<JobStats> {
const outcomes = job.benchmark_outcomes || [];
const scenarioCount = job.job_spec?.scenario_ids?.length || 0;
const agentCount = job.job_spec?.agent_configs?.length || 1;
Expand All @@ -65,6 +75,7 @@ function aggregateJobStats(job: BenchmarkJob): JobStats {
let scoreSum = 0;
let scoreCount = 0;

// Count from completed benchmark runs
for (const outcome of outcomes) {
done += outcome.n_completed + outcome.n_failed + outcome.n_timeout;
errors += outcome.n_failed + outcome.n_timeout;
Expand All @@ -74,6 +85,38 @@ function aggregateJobStats(job: BenchmarkJob): JobStats {
}
}

// Count finished scenarios from in-progress benchmark runs
const inProgressRuns = job.in_progress_runs || [];
if (inProgressRuns.length > 0) {
const runResults = await Promise.all(
inProgressRuns.map((run) =>
listBenchmarkRunScenarioRuns(run.benchmark_run_id),
),
);
for (const scenarioRuns of runResults) {
let runScoreSum = 0;
let runScoreCount = 0;
for (const sr of scenarioRuns) {
const state = sr.state?.toLowerCase() || "";
if (SCENARIO_DONE_STATES.has(state)) {
done++;
if (state !== "completed") {
errors++;
}
Comment on lines +103 to +105
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Canceled runs will count towards the error count here, because every finished state other than "completed" increments `errors`.

const score = sr.scoring_contract_result?.score;
if (score !== undefined && score !== null) {
runScoreSum += score;
runScoreCount++;
}
}
}
if (runScoreCount > 0) {
scoreSum += runScoreSum / runScoreCount;
scoreCount++;
}
}
}

return {
done,
total: total || done,
Expand Down Expand Up @@ -120,7 +163,7 @@ function truncate(str: string, maxLen: number): string {
return str.slice(0, maxLen - 1) + "…";
}

function printTable(jobs: BenchmarkJob[]): void {
async function printTable(jobs: BenchmarkJob[]): Promise<void> {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this method name sounds as though it's an idempotent screen-dumping operation, but it does real work under the hood (it awaits `aggregateJobStats` for every job).

if (jobs.length === 0) {
console.log(chalk.dim("No benchmark jobs found"));
return;
Expand Down Expand Up @@ -149,7 +192,7 @@ function printTable(jobs: BenchmarkJob[]): void {

// Rows
for (const job of jobs) {
const stats = aggregateJobStats(job);
const stats = await aggregateJobStats(job);

const id = truncate(job.id, COL_ID).padEnd(COL_ID);
const name = truncate(job.name || "", nameWidth).padEnd(nameWidth);
Expand Down Expand Up @@ -262,7 +305,7 @@ export async function listBenchmarkJobsCommand(
if (format !== "text") {
output(jobs, { format, defaultFormat: "json" });
} else {
printTable(jobs);
await printTable(jobs);
}
} catch (error) {
outputError("Failed to list benchmark jobs", error);
Expand Down
Loading