diff --git a/README.md b/README.md index c87cea3..c4ac873 100644 --- a/README.md +++ b/README.md @@ -181,12 +181,19 @@ rli mcp start # Start the MCP server rli mcp install # Install Runloop MCP server configurat... ``` +### Scenario Commands (alias: `scn`) + +```bash +rli scenario info # Display scenario definition details +``` + ### Benchmark-job Commands (alias: `bmj`) ```bash rli benchmark-job run # Run a benchmark job with one or more ... rli benchmark-job summary # Get benchmark job summary and results rli benchmark-job watch # Watch benchmark job progress in real-... +rli benchmark-job logs # Download devbox logs for all scenario... rli benchmark-job list # List benchmark jobs ``` diff --git a/package.json b/package.json index 9bb7933..7378c01 100644 --- a/package.json +++ b/package.json @@ -74,6 +74,7 @@ "@modelcontextprotocol/sdk": "^1.26.0", "@runloop/api-client": "1.10.3", "@types/express": "^5.0.6", + "adm-zip": "^0.5.16", "chalk": "^5.6.2", "commander": "^14.0.2", "conf": "^15.0.2", @@ -98,6 +99,7 @@ }, "devDependencies": { "@anthropic-ai/mcpb": "^2.1.2", + "@types/adm-zip": "^0.5.7", "@types/jest": "^29.5.14", "@types/node": "^22.19.7", "@types/react": "^19.2.10", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 6594257..142814e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,6 +23,9 @@ importers: '@types/express': specifier: ^5.0.6 version: 5.0.6 + adm-zip: + specifier: ^0.5.16 + version: 0.5.16 chalk: specifier: ^5.6.2 version: 5.6.2 @@ -75,6 +78,9 @@ importers: '@anthropic-ai/mcpb': specifier: ^2.1.2 version: 2.1.2 + '@types/adm-zip': + specifier: ^0.5.7 + version: 0.5.7 '@types/jest': specifier: ^29.5.14 version: 29.5.14 @@ -713,6 +719,9 @@ packages: '@tsconfig/node16@1.0.4': resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==} + '@types/adm-zip@0.5.7': + resolution: {integrity: 
sha512-DNEs/QvmyRLurdQPChqq0Md4zGvPwHerAJYWk9l2jCbD1VPpnzRJorOdiq4zsw09NFbYnhfsoEhWtxIzXpn2yw==} + '@types/babel__core@7.20.5': resolution: {integrity: sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==} @@ -887,6 +896,10 @@ packages: engines: {node: '>=0.4.0'} hasBin: true + adm-zip@0.5.16: + resolution: {integrity: sha512-TGw5yVi4saajsSEgz25grObGHEUaDrniwvA2qwSC060KfqGPdglhvPMA2lPIoxs3PQIItj2iag35fONcQqgUaQ==} + engines: {node: '>=12.0'} + agentkeepalive@4.6.0: resolution: {integrity: sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==} engines: {node: '>= 8.0.0'} @@ -3857,6 +3870,10 @@ snapshots: '@tsconfig/node16@1.0.4': {} + '@types/adm-zip@0.5.7': + dependencies: + '@types/node': 22.19.7 + '@types/babel__core@7.20.5': dependencies: '@babel/parser': 7.28.6 @@ -4085,6 +4102,8 @@ snapshots: acorn@8.15.0: {} + adm-zip@0.5.16: {} + agentkeepalive@4.6.0: dependencies: humanize-ms: 1.2.1 diff --git a/src/commands/benchmark-job/logs.ts b/src/commands/benchmark-job/logs.ts new file mode 100644 index 0000000..d434210 --- /dev/null +++ b/src/commands/benchmark-job/logs.ts @@ -0,0 +1,348 @@ +/** + * Download devbox logs for all scenario runs in a benchmark job. + * + * Traverses job -> benchmark runs -> scenario runs, downloading and extracting + * log ZIPs into an organized directory structure: <output-dir>/<agent>/<scenario>/ + * + * Also writes a results.json with scoring and state info for each scenario run. 
+ */
+
+import * as fs from "fs";
+import * as path from "path";
+import AdmZip from "adm-zip";
+import chalk from "chalk";
+import { getClient } from "../../utils/client.js";
+import {
+  getBenchmarkJob,
+  listBenchmarkRunScenarioRuns,
+  type BenchmarkJob,
+  type ScenarioRun,
+} from "../../services/benchmarkJobService.js";
+import { outputError } from "../../utils/output.js";
+import type { BenchmarkJobView } from "@runloop/api-client/resources/benchmark-jobs";
+
+interface LogsOptions {
+  outputDir?: string;
+  run?: string;
+  scenario?: string;
+}
+
+// Info gathered for each benchmark run before downloading logs
+interface BenchmarkRunInfo {
+  benchmarkRunId: string;
+  agentName: string;
+  modelName?: string;
+}
+
+// A scenario outcome from a completed benchmark run
+type ScenarioOutcome = NonNullable<
+  BenchmarkJobView["benchmark_outcomes"]
+>[number]["scenario_outcomes"][number];
+
+// Info for a single scenario run's log download
+interface ScenarioLogTarget {
+  agentName: string;
+  modelName?: string;
+  scenarioName: string;
+  scenarioRunId: string;
+  scenarioRun: ScenarioRun;
+  outcome?: ScenarioOutcome;
+  destDir: string;
+}
+
+/** Build the agent directory name, including model if present */
+function agentDirName(agentName: string, modelName?: string): string {
+  return sanitizeDirName(modelName ? `${agentName}_${modelName}` : agentName);
+}
+
+/** Extract agent name and model from an in-progress run's agent config */
+function getAgentInfoFromInProgressRun(
+  run: NonNullable<BenchmarkJob["in_progress_runs"]>[number],
+): { name: string; model?: string } {
+  const config = run.agent_config;
+  if (config && config.type === "job_agent") {
+    return { name: config.name, model: config.model_name ?? undefined };
+  }
+  return { name: "unknown" };
+}
+
+/** Collect benchmark run IDs and agent names from the job */
+function collectBenchmarkRuns(job: BenchmarkJob): BenchmarkRunInfo[] {
+  const runs: BenchmarkRunInfo[] = [];
+
+  for (const outcome of job.benchmark_outcomes || []) {
+    runs.push({
+      benchmarkRunId: outcome.benchmark_run_id,
+      agentName: outcome.agent_name,
+      modelName: outcome.model_name ?? undefined,
+    });
+  }
+
+  for (const run of job.in_progress_runs || []) {
+    const info = getAgentInfoFromInProgressRun(run);
+    runs.push({
+      benchmarkRunId: run.benchmark_run_id,
+      agentName: info.name,
+      modelName: info.model,
+    });
+  }
+
+  return runs;
+}
+
+/** Build a map of scenario_run_id -> scenario outcome from completed outcomes */
+function buildScenarioOutcomeMap(
+  job: BenchmarkJob,
+): Map<string, ScenarioOutcome> {
+  const map = new Map<string, ScenarioOutcome>();
+  for (const outcome of job.benchmark_outcomes || []) {
+    for (const scenario of outcome.scenario_outcomes || []) {
+      map.set(scenario.scenario_run_id, scenario);
+    }
+  }
+  return map;
+}
+
+/** Look up a scenario name, falling back to the API if not in the outcomes map */
+async function resolveScenarioName(
+  scenarioRunId: string,
+  scenarioId: string,
+  outcomeMap: Map<string, ScenarioOutcome>,
+): Promise<string> {
+  const outcome = outcomeMap.get(scenarioRunId);
+  if (outcome) return outcome.scenario_name;
+
+  try {
+    const scenario = await getClient().scenarios.retrieve(scenarioId);
+    return scenario.name;
+  } catch {
+    return scenarioId; // best effort: fall back to the raw ID
+  }
+}
+
+/**
+ * Sanitize a name for use as a single directory name. Results that are
+ * empty, "." or ".." become "_" so they cannot collapse or escape the parent.
+ */
+function sanitizeDirName(name: string): string {
+  const safe = name.replace(/[/\\:*?"<>|]/g, "_").replace(/\s+/g, "-");
+  return /^\.{0,2}$/.test(safe) ? "_" : safe;
+}
+
+/**
+ * Assign unique destination directories, appending -2, -3, etc. for duplicates.
+ * Mutates targets in place, setting destDir on each.
+ */
+function assignDestDirs(targets: ScenarioLogTarget[], outputDir: string): void {
+  // Pass 1: park each target on its base directory and count how many share it.
+  const totals = new Map<string, number>();
+  for (const target of targets) {
+    target.destDir = path.join(
+      outputDir,
+      agentDirName(target.agentName, target.modelName),
+      sanitizeDirName(target.scenarioName),
+    );
+    totals.set(target.destDir, (totals.get(target.destDir) || 0) + 1);
+  }
+
+  // Unique bases keep their name and are reserved up front, so a generated
+  // suffix can never land on a real directory that is literally "base-N".
+  const taken = new Set(
+    [...totals].filter(([, total]) => total === 1).map(([dir]) => dir),
+  );
+
+  // Pass 2: shared bases get -1, -2, ... in input order, skipping taken names.
+  for (const target of targets) {
+    const baseDir = target.destDir;
+    if (totals.get(baseDir) === 1) continue;
+    let suffix = 1;
+    while (taken.has(`${baseDir}-${suffix}`)) suffix++;
+    target.destDir = `${baseDir}-${suffix}`;
+    taken.add(target.destDir);
+  }
+}
+
+/** Build a results summary object for a scenario run */
+function buildResultsSummary(
+  target: ScenarioLogTarget,
+): Record<string, unknown> {
+  const { scenarioRun: sr, outcome } = target;
+
+  const summary: Record<string, unknown> = {
+    scenario_run_id: sr.id,
+    scenario_id: sr.scenario_id,
+    scenario_name: target.scenarioName,
+    state: outcome?.state || sr.state,
+    score: outcome?.score ?? sr.scoring_contract_result?.score ?? null,
+    duration_ms: outcome?.duration_ms ?? sr.duration_ms ?? null,
+  };
+
+  // Include failure reason if present
+  if (outcome?.failure_reason) {
+    summary.failure_reason = outcome.failure_reason;
+  }
+
+  // Include per-scorer details from the scenario run
+  const scoringResults = sr.scoring_contract_result?.scoring_function_results;
+  if (scoringResults && scoringResults.length > 0) {
+    summary.scoring_functions = scoringResults.map((fn) => ({
+      name: fn.scoring_function_name,
+      score: fn.score,
+      state: fn.state,
+      output: fn.output,
+    }));
+  }
+
+  return summary;
+}
+
+/** Download logs ZIP, extract it, and write results.json for a single scenario run */
+async function downloadScenarioLogs(
+  target: ScenarioLogTarget,
+): Promise<boolean> {
+  const client = getClient();
+
+  try {
+    fs.mkdirSync(target.destDir, { recursive: true });
+
+    // Write results summary
+    const results = buildResultsSummary(target);
+    fs.writeFileSync(
+      path.join(target.destDir, "results.json"),
+      JSON.stringify(results, null, 2) + "\n",
+    );
+
+    // Download and extract log ZIP
+    const response = await client.scenarios.runs.downloadLogs(
+      target.scenarioRunId,
+    );
+    const zip = new AdmZip(Buffer.from(await response.arrayBuffer()));
+    const prefix = `${target.scenarioRunId}_`;
+    const destRoot = path.resolve(target.destDir) + path.sep;
+    for (const entry of zip.getEntries()) {
+      if (entry.isDirectory) continue; // folders are created on demand below
+      const name = entry.entryName.startsWith(prefix)
+        ? entry.entryName.slice(prefix.length)
+        : entry.entryName;
+
+      // Prevent Zip Slip / directory traversal by ensuring the target path
+      // stays within target.destDir.
+      const destPath = path.resolve(target.destDir, name);
+      if (!destPath.startsWith(destRoot)) {
+        console.warn(
+          chalk.yellow(
+            ` Skipping suspicious log file path in archive: ${entry.entryName}`,
+          ),
+        );
+        continue;
+      }
+
+      // Entries can be nested ("dir/file.log"); writeFileSync needs the folder.
+      fs.mkdirSync(path.dirname(destPath), { recursive: true });
+      fs.writeFileSync(destPath, entry.getData());
+    }
+
+    return true;
+  } catch (error) {
+    const msg = error instanceof Error ? error.message : String(error);
+    console.error(
+      chalk.yellow(
+        ` Warning: failed to download logs for ${target.scenarioName} (${target.scenarioRunId}): ${msg}`,
+      ),
+    );
+    return false;
+  }
+}
+
+/** Download logs and results.json for every scenario run in a benchmark job */
+export async function downloadBenchmarkJobLogs(
+  jobId: string,
+  options: LogsOptions = {},
+) {
+  try {
+    const job = await getBenchmarkJob(jobId);
+    const outputDir = options.outputDir || `./logs/${jobId}`;
+
+    // Collect all benchmark runs from the job
+    let runs = collectBenchmarkRuns(job);
+    if (runs.length === 0) {
+      console.log(chalk.yellow("No benchmark runs found for this job."));
+      return;
+    }
+
+    // Apply --run filter
+    if (options.run) {
+      runs = runs.filter((r) => r.benchmarkRunId === options.run);
+      if (runs.length === 0) {
+        outputError(`Benchmark run ${options.run} not found in job ${jobId}`);
+      }
+    }
+
+    // Build scenario outcome lookup from completed outcomes
+    const outcomeMap = buildScenarioOutcomeMap(job);
+
+    // Gather all scenario log targets across benchmark runs
+    const targets: ScenarioLogTarget[] = [];
+
+    for (const run of runs) {
+      const agentLabel = run.modelName
+        ? `${run.agentName}:${run.modelName}`
+        : run.agentName;
+      console.log(
+        chalk.dim(`Fetching scenario runs for agent "${agentLabel}"...`),
+      );
+      let scenarioRuns = await listBenchmarkRunScenarioRuns(run.benchmarkRunId);
+
+      // Apply --scenario filter
+      if (options.scenario) {
+        scenarioRuns = scenarioRuns.filter((sr) => sr.id === options.scenario);
+      }
+
+      for (const sr of scenarioRuns) {
+        const scenarioName = await resolveScenarioName(
+          sr.id,
+          sr.scenario_id,
+          outcomeMap,
+        );
+        targets.push({
+          agentName: run.agentName,
+          modelName: run.modelName,
+          scenarioName,
+          scenarioRunId: sr.id,
+          scenarioRun: sr,
+          outcome: outcomeMap.get(sr.id),
+          destDir: "", // assigned below
+        });
+      }
+    }
+
+    if (targets.length === 0) {
+      console.log(chalk.yellow("No scenario runs found to download logs for."));
+      return;
+    }
+
+    // Assign unique directory names, handling duplicates
+    assignDestDirs(targets, outputDir);
+
+    console.log(
+      `\nDownloading logs for ${targets.length} scenario run(s) to ${chalk.bold(outputDir)}\n`,
+    );
+
+    // Download logs one at a time to avoid overwhelming the API
+    let succeeded = 0;
+    for (const target of targets) {
+      process.stdout.write(` ${target.agentName} / ${target.scenarioName}... `);
+      const ok = await downloadScenarioLogs(target);
+      if (ok) {
+        console.log(chalk.green("done"));
+        succeeded++;
+      }
+    }
+
+    console.log(
+      `\n${chalk.green(`Downloaded logs for ${succeeded}/${targets.length} scenario run(s)`)} to ${chalk.bold(outputDir)}`,
+    );
+  } catch (error) {
+    outputError("Failed to download benchmark job logs", error);
+  }
+}
diff --git a/src/commands/scenario/info.ts b/src/commands/scenario/info.ts
new file mode 100644
index 0000000..b2f66b8
--- /dev/null
+++ b/src/commands/scenario/info.ts
@@ -0,0 +1,198 @@
+/**
+ * Display scenario definition details in a readable format.
+ */
+
+import chalk from "chalk";
+import { getClient } from "../../utils/client.js";
+import { output, outputError } from "../../utils/output.js";
+import type { ScenarioView } from "@runloop/api-client/resources/scenarios";
+
+interface InfoOptions {
+  output?: string;
+}
+
+/** Format a scoring function's details for display */
+function formatScorer(
+  scorer: ScenarioView["scoring_contract"]["scoring_function_parameters"][number],
+): string {
+  const lines: string[] = [];
+  const s = scorer.scorer;
+  lines.push(` type: ${s.type}`);
+  lines.push(` weight: ${scorer.weight}`);
+
+  // Scorer types without a case below contribute only the type/weight lines.
+  switch (s.type) {
+    case "test_based_scorer":
+      if (s.test_command) lines.push(` test_command: ${s.test_command}`);
+      if (s.test_files) {
+        for (const tf of s.test_files) {
+          lines.push(` file: ${tf.file_path || "(unnamed)"}`);
+          if (tf.file_contents) {
+            const indented = tf.file_contents
+              .split("\n")
+              .map((l) => ` ${l}`)
+              .join("\n");
+            lines.push(indented);
+          }
+        }
+      }
+      break;
+    case "bash_script_scorer":
+      if (s.bash_script) {
+        lines.push(" script:");
+        lines.push(
+          s.bash_script
+            .split("\n")
+            .map((l) => ` ${l}`)
+            .join("\n"),
+        );
+      }
+      break;
+    case "command_scorer":
+      if (s.command) lines.push(` command: ${s.command}`);
+      break;
+    case "python_script_scorer":
+      if (s.python_version_constraint)
+        lines.push(` python: ${s.python_version_constraint}`);
+      if (s.requirements_contents)
+        lines.push(` requirements: ${s.requirements_contents}`);
+      lines.push(" script:");
+      lines.push(
+        s.python_script
+          .split("\n")
+          .map((l) => ` ${l}`)
+          .join("\n"),
+      );
+      break;
+    case "ast_grep_scorer":
+      lines.push(` pattern: ${s.pattern}`);
+      lines.push(` search_directory: ${s.search_directory}`);
+      if (s.lang) lines.push(` lang: ${s.lang}`);
+      break;
+    case "custom_scorer":
+      lines.push(` custom_type: ${s.custom_scorer_type}`);
+      if (s.scorer_params)
+        lines.push(` params: ${JSON.stringify(s.scorer_params)}`);
+      break;
+  }
+
+  return lines.join("\n");
+}
+
+/** Print a human-readable, sectioned summary of a scenario to stdout */
+function printScenario(scenario: ScenarioView): void {
+  console.log(chalk.bold("Scenario: ") + scenario.name);
+  console.log(chalk.dim("ID: ") + scenario.id);
+  console.log(chalk.dim("Status: ") + scenario.status);
+  if (scenario.validation_type && scenario.validation_type !== "UNSPECIFIED") {
+    console.log(chalk.dim("Validation: ") + scenario.validation_type);
+  }
+
+  // Environment
+  const env = scenario.environment;
+  if (env) {
+    console.log();
+    console.log(chalk.bold("Environment:"));
+    if (env.blueprint_id) console.log(` blueprint: ${env.blueprint_id}`);
+    if (env.snapshot_id) console.log(` snapshot: ${env.snapshot_id}`);
+    if (env.working_directory)
+      console.log(` working_directory: ${env.working_directory}`);
+    if (env.launch_parameters) {
+      const lp = env.launch_parameters;
+      if (lp.architecture) console.log(` architecture: ${lp.architecture}`);
+      if (lp.resource_size_request)
+        console.log(` resources: ${lp.resource_size_request}`);
+      if (lp.launch_commands?.length) {
+        console.log(" launch_commands:");
+        for (const cmd of lp.launch_commands) {
+          console.log(` - ${cmd}`);
+        }
+      }
+    }
+  }
+
+  // Required env vars / secrets
+  if (scenario.required_environment_variables?.length) {
+    console.log();
+    console.log(chalk.bold("Required Environment Variables:"));
+    for (const v of scenario.required_environment_variables) {
+      console.log(` - ${v}`);
+    }
+  }
+  if (scenario.required_secret_names?.length) {
+    console.log();
+    console.log(chalk.bold("Required Secrets:"));
+    for (const s of scenario.required_secret_names) {
+      console.log(` - ${s}`);
+    }
+  }
+
+  // Metadata
+  if (scenario.metadata && Object.keys(scenario.metadata).length > 0) {
+    console.log();
+    console.log(chalk.bold("Metadata:"));
+    for (const [k, v] of Object.entries(scenario.metadata)) {
+      console.log(` ${k}: ${v}`);
+    }
+  }
+
+  // Problem statement
+  const { problem_statement, additional_context } = scenario.input_context;
+  console.log();
+  console.log(chalk.bold("Problem Statement:"));
+  console.log(indent(problem_statement, 2));
+
+  if (additional_context) {
+    console.log();
+    console.log(chalk.bold("Additional Context:"));
+    console.log(indent(JSON.stringify(additional_context, null, 2), 2));
+  }
+
+  // Reference output
+  if (scenario.reference_output) {
+    console.log();
+    console.log(chalk.bold("Reference Output:"));
+    console.log(indent(scenario.reference_output, 2));
+  }
+
+  // Scoring
+  const scorers = scenario.scoring_contract.scoring_function_parameters;
+  if (scorers.length > 0) {
+    console.log();
+    console.log(chalk.bold("Scoring Functions:"));
+    for (const scorer of scorers) {
+      console.log(` ${chalk.cyan(scorer.name)}:`);
+      console.log(formatScorer(scorer));
+    }
+  }
+
+  if (scenario.scorer_timeout_sec) {
+    console.log();
+    console.log(chalk.dim(`Scorer timeout: ${scenario.scorer_timeout_sec}s`));
+  }
+}
+
+/** Prefix every line of text with the given number of spaces */
+function indent(text: string, spaces: number): string {
+  const pad = " ".repeat(spaces);
+  return text
+    .split("\n")
+    .map((l) => pad + l)
+    .join("\n");
+}
+
+/** Fetch a scenario by ID and print it, or emit it in a structured format */
+export async function scenarioInfo(id: string, options: InfoOptions = {}) {
+  try {
+    const client = getClient();
+    const scenario = await client.scenarios.retrieve(id);
+
+    if (options.output && options.output !== "text") {
+      output(scenario, { format: options.output, defaultFormat: "json" });
+    } else {
+      printScenario(scenario);
+    }
+  } catch (error) {
+    outputError("Failed to get scenario info", error);
+  }
+}
diff --git a/src/utils/commands.ts b/src/utils/commands.ts
index 58d0839..0a25c86 100644
--- a/src/utils/commands.ts
+++ b/src/utils/commands.ts
@@ -1012,6 +1012,24 @@ export function createProgram(): Command {
       await installMcpConfig();
     });
 
+  // Scenario commands
+  const scenario = program
+    .command("scenario")
+    .description("Manage scenarios")
+    .alias("scn");
+
+  scenario
+    .command("info <id>")
+    .description("Display scenario definition details")
+    .option(
+      "-o, --output [format]",
+      "Output format: text|json|yaml (default: text)",
+    )
+    .action(async (id, options) => {
+      const { scenarioInfo } = await import("../commands/scenario/info.js");
+      await scenarioInfo(id, options);
+    });
+
   // Benchmark job commands
   const benchmarkJob = program
     .command("benchmark-job")
@@ -1076,6 +1094,20 @@ export function createProgram(): Command {
       await watchBenchmarkJob(id);
     });
 
+  benchmarkJob
+    .command("logs <id>")
+    .description(
+      "Download devbox logs for all scenario runs in a benchmark job",
+    )
+    .option("-o, --output-dir <path>", "Output directory")
+    .option("--run <id>", "Download logs for a specific benchmark run only")
+    .option("--scenario <id>", "Download logs for a specific scenario run only")
+    .action(async (id, options) => {
+      const { downloadBenchmarkJobLogs } =
+        await import("../commands/benchmark-job/logs.js");
+      await downloadBenchmarkJobLogs(id, options);
+    });
+
   benchmarkJob
     .command("list")
     .description("List benchmark jobs")