From 0ff1448b126c567ff605ffe9b5de55159d47bc91 Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Fri, 6 Mar 2026 16:07:52 -0800 Subject: [PATCH 1/5] update with secrets and public benchmark display --- README.md | 23 ++ misc/config.yml | 3 +- package.json | 1 - src/screens/BenchmarkJobCreateScreen.tsx | 371 +++++++++++++++++++++++ src/screens/BenchmarkJobDetailScreen.tsx | 27 +- src/screens/BenchmarkJobListScreen.tsx | 82 +++-- src/screens/BenchmarkListScreen.tsx | 25 +- src/services/benchmarkService.ts | 4 +- 8 files changed, 488 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 86d1bb95..577406c8 100644 --- a/README.md +++ b/README.md @@ -235,6 +235,29 @@ pnpm run build # Watch mode pnpm run dev +``` + +### Debugging the TUI + +If the TUI crashes (e.g. when pressing Enter on a form field), you can capture logs to inspect the error: + +**Option 1 – stderr to file (no debug env)** +Useful to see uncaught errors and stack traces that the app writes to stderr: + +```bash +pnpm run build +pnpm run start:debug +# Reproduce the crash, then: +cat debug.log +``` + +**Option 2 – run under Node with inspector** +To get a stack trace from an uncaught exception, run with Node’s inspector and reproduce the crash; the process will pause and you can inspect the stack: + +```bash +node --inspect-brk dist/cli.js +# Attach Chrome/Edge to the URL shown (e.g. chrome://inspect) and resume; reproduce the crash. 
+``` ## Contributing diff --git a/misc/config.yml b/misc/config.yml index f3a417bc..f20e7d65 100644 --- a/misc/config.yml +++ b/misc/config.yml @@ -9,8 +9,7 @@ command: rli cwd: ~ # Export additional ENV variables -env: - recording: true +env: {} # Explicitly set the number of columns # or use `auto` to take the current diff --git a/package.json b/package.json index 82b3e67d..76abd48e 100644 --- a/package.json +++ b/package.json @@ -11,7 +11,6 @@ "build:mcp": "pnpm run build && node scripts/build-mcp.js", "dev": "tsc --watch", "start": "node dist/cli.js", - "start:debug": "node dist/cli.js 2> debug.log", "prepublishOnly": "pnpm run build", "version:patch": "pnpm version patch", "version:minor": "pnpm version minor", diff --git a/src/screens/BenchmarkJobCreateScreen.tsx b/src/screens/BenchmarkJobCreateScreen.tsx index 565e0171..c4138c52 100644 --- a/src/screens/BenchmarkJobCreateScreen.tsx +++ b/src/screens/BenchmarkJobCreateScreen.tsx @@ -29,12 +29,20 @@ import { type OrchestratorConfig, } from "../services/benchmarkJobService.js"; import type { Benchmark } from "../store/benchmarkStore.js"; +import { getClient } from "../utils/client.js"; + +/** Secret list item for account secrets picker */ +interface SecretListItem { + id: string; + name: string; +} type FormField = | "source_type" | "benchmark" | "scenarios" | "agents" + | "secrets" | "model_names" | "name" | "agent_timeout" @@ -49,6 +57,8 @@ interface FormData { scenarioNames: string[]; agentIds: string[]; agentNames: string[]; + /** Env var name -> secret name (account secret) */ + secretsMapping: Record; /** Comma-separated model names (one per agent, or one value applied to all) */ modelNamesInput: string; name: string; @@ -61,6 +71,9 @@ type ScreenState = | "picking_benchmark" | "picking_scenarios" | "picking_agents" + | "secrets_config" + | "picking_secret" + | "entering_env_var" | "creating" | "success" | "error"; @@ -80,6 +93,187 @@ interface BenchmarkJobCreateScreenProps { cloneConcurrentTrials?: 
string; } +/** + * Secrets config sub-screen: list mappings, Add, Done. Handles its own input so hooks are stable. + */ +function SecretsConfigView({ + mappingEntries, + selectedIndex, + onSelectIndex, + onAdd, + onDone, + onRemove, + onBack, +}: { + mappingEntries: [string, string][]; + selectedIndex: number; + onSelectIndex: (i: number) => void; + onAdd: () => void; + onDone: () => void; + onRemove: (envVar: string) => void; + onBack: () => void; +}) { + const totalOptions = mappingEntries.length + 2; + const idx = Math.min(selectedIndex, Math.max(0, totalOptions - 1)); + + useInput((_input, key) => { + if (key.upArrow && idx > 0) { + onSelectIndex(idx - 1); + } else if (key.downArrow && idx < totalOptions - 1) { + onSelectIndex(idx + 1); + } else if (key.return) { + if (idx === mappingEntries.length) { + onAdd(); + } else if (idx === mappingEntries.length + 1) { + onDone(); + } else { + const keyToRemove = mappingEntries[idx][0]; + onRemove(keyToRemove); + onSelectIndex(Math.max(0, idx - 1)); + } + } else if (key.escape) { + onBack(); + } + }); + + return ( + <> + + + + + {figures.pointer} Secrets (env → secret) + + + {mappingEntries.map(([envVar, secretName], i) => ( + + + + {idx === i ? figures.pointer : " "} + + + + {envVar} → {secretName} + + {idx === i && ( + + {" "} + Enter to remove + + )} + + ))} + + + + {idx === mappingEntries.length ? figures.pointer : " "} + + + + + Add secret + + + + + + {idx === mappingEntries.length + 1 ? figures.pointer : " "} + + + + Done + + + + + + ); +} + +/** + * Inline view to enter env var name for a selected secret + * Pre-fills with secret name so Enter uses it as-is; user can edit if needed. 
+ */ +function EnvVarInputView({ + secretName, + onSubmit, + onCancel, +}: { + secretName: string; + onSubmit: (value: string) => void; + onCancel: () => void; +}) { + const [value, setValue] = React.useState(secretName); + useInput((_input, key) => { + if (key.return) { + onSubmit(value.trim() || secretName); + } else if (key.escape) { + onCancel(); + } + }); + return ( + <> + + + + + Env var name for secret "{secretName}": + + + + onSubmit(value.trim() || secretName)} + /> + + + + + ); +} + /** * Success screen component with input handling */ @@ -174,16 +368,26 @@ export function BenchmarkJobCreateScreen({ const [formData, setFormData] = React.useState(() => { let modelNamesInput = ""; + let secretsMapping: Record = {}; try { if (cloneAgentConfigs) { const arr = JSON.parse(cloneAgentConfigs) as Array<{ modelName?: string | null; model_name?: string | null; + secrets?: Record; + secret_names?: Record; }>; modelNamesInput = arr .map((a) => a.modelName ?? a.model_name ?? "") .filter(Boolean) .join(", "); + // Merge secrets from all agent configs into one mapping (clone prefill) + const allSecrets = arr + .map((a) => a.secrets ?? a.secret_names) + .filter((s): s is Record => !!s && typeof s === "object"); + if (allSecrets.length > 0) { + secretsMapping = Object.assign({}, ...allSecrets); + } } } catch { // ignore invalid JSON @@ -196,6 +400,7 @@ export function BenchmarkJobCreateScreen({ scenarioNames: [], agentIds: cloneAgentIds ? cloneAgentIds.split(",") : [], agentNames: cloneAgentNames ? cloneAgentNames.split(",") : [], + secretsMapping, modelNamesInput, name: cloneJobName ? 
`${cloneJobName} (clone)` : "", agentTimeout: cloneAgentTimeout || "", @@ -205,6 +410,14 @@ export function BenchmarkJobCreateScreen({ const [createdJob, setCreatedJob] = React.useState(null); const [error, setError] = React.useState(null); + /** When adding a secret: selected secret awaiting env var name */ + const [pendingSecretForEnv, setPendingSecretForEnv] = React.useState<{ + id: string; + name: string; + } | null>(null); + /** In secrets_config, index of the highlighted row: 0..n-1 are mapping rows (Enter removes), n is "Add secret", n+1 is "Done" */ + const [secretsConfigSelectedIndex, setSecretsConfigSelectedIndex] = + React.useState(0); // Handle Ctrl+C to exit useExitOnCtrlC(); @@ -288,6 +501,16 @@ export function BenchmarkJobCreateScreen({ required: true, description: "Select one or more agents to run", }, + { + key: "secrets", + label: "Secrets (env → secret)", + type: "picker", + required: false, + description: + cloneFromJobId && Object.keys(formData.secretsMapping).length === 0 + ? "Optional. The API does not return secrets on job fetch; add any needed env→secret mappings here." + : "Optional. Map environment variable names to account secrets.", + }, { key: "model_names", label: "Model names (comma-separated, optional)", @@ -464,6 +687,55 @@ export function BenchmarkJobCreateScreen({ [fetchAgentsPage], ); + // Fetch account secrets for picker (client-side pagination) + const fetchSecretsPage = React.useCallback( + async (params: { limit: number; startingAt?: string; search?: string }) => { + const client = getClient(); + const result = await client.secrets.list({ limit: 5000 }); + const raw = (result.secrets || []) as Array<{ id: string; name: string }>; + let items = raw.map((s) => ({ id: s.id, name: s.name || s.id })); + if (params.search) { + const q = params.search.toLowerCase(); + items = items.filter( + (s) => + s.name.toLowerCase().includes(q) || s.id.toLowerCase().includes(q), + ); + } + const startIdx = params.startingAt + ? 
items.findIndex((s) => s.id === params.startingAt) + 1 + : 0; + const page = items.slice(startIdx, startIdx + params.limit); + return { + items: page, + hasMore: startIdx + params.limit < items.length, + totalCount: items.length, + }; + }, + [], + ); + + const secretPickerConfig = React.useMemo( + () => ({ + title: "Select Secret", + fetchPage: fetchSecretsPage, + getItemId: (s: SecretListItem) => s.id, + getItemLabel: (s: SecretListItem) => s.name, + getItemStatus: () => undefined, + mode: "single" as const, + minSelection: 1, + emptyMessage: "No secrets found", + searchPlaceholder: "Search secrets...", + breadcrumbItems: [ + { label: "Home" }, + { label: "Benchmarks" }, + { label: "Jobs" }, + { label: "Create" }, + { label: "Select Secret", active: true }, + ], + }), + [fetchSecretsPage], + ); + // Handle benchmark selection (single) const handleBenchmarkSelect = React.useCallback((items: Benchmark[]) => { if (items.length > 0) { @@ -497,6 +769,37 @@ export function BenchmarkJobCreateScreen({ setScreenState("form"); }, []); + // After picking a secret: set pending and go to env var input + const handleSecretSelect = React.useCallback((items: SecretListItem[]) => { + if (items.length > 0) { + const s = items[0]; + setPendingSecretForEnv({ id: s.id, name: s.name }); + setScreenState("entering_env_var"); + } else { + setScreenState("secrets_config"); + } + }, []); + + // After entering env var for pending secret: add mapping and return to secrets_config + // If envVarName is empty, use secret name as-is for the mapping (env var name = secret name). 
+ const handleEnvVarForSecretSubmit = React.useCallback( + (envVarName: string) => { + const envVarToUse = envVarName.trim() || pendingSecretForEnv?.name || ""; + if (envVarToUse && pendingSecretForEnv) { + setFormData((prev) => ({ + ...prev, + secretsMapping: { + ...prev.secretsMapping, + [envVarToUse]: pendingSecretForEnv.name, + }, + })); + } + setPendingSecretForEnv(null); + setScreenState("secrets_config"); + }, + [pendingSecretForEnv], + ); + // Handle create const handleCreate = React.useCallback(async () => { if (!isFormValid) return; @@ -545,6 +848,13 @@ export function BenchmarkJobCreateScreen({ }); } + // Form secrets are source of truth: apply to all agents + if (Object.keys(formData.secretsMapping).length > 0) { + for (const config of agentConfigs) { + config.secrets = { ...formData.secretsMapping }; + } + } + // Use cloned orchestrator config if available, otherwise build from form let orchestratorConfig: OrchestratorConfig | undefined; if (cloneOrchestratorConfig) { @@ -617,6 +927,12 @@ export function BenchmarkJobCreateScreen({ currentField === "agents" ) { setScreenState("picking_agents"); + } else if ( + currentFieldDef?.type === "picker" && + currentField === "secrets" + ) { + setScreenState("secrets_config"); + setSecretsConfigSelectedIndex(0); } else if ( currentFieldDef?.type === "action" && currentField === "create" @@ -629,6 +945,54 @@ export function BenchmarkJobCreateScreen({ } }); + // ----- Secrets sub-flow ----- + const mappingEntries = Object.entries(formData.secretsMapping); + + if (screenState === "secrets_config") { + return ( + setScreenState("picking_secret")} + onDone={() => setScreenState("form")} + onRemove={(envVar) => { + setFormData((prev) => { + const next = { ...prev.secretsMapping }; + delete next[envVar]; + return { ...prev, secretsMapping: next }; + }); + setSecretsConfigSelectedIndex((i) => Math.max(0, i - 1)); + }} + onBack={() => setScreenState("form")} + /> + ); + } + + if (screenState === "entering_env_var" && 
pendingSecretForEnv) { + return ( + handleEnvVarForSecretSubmit(val)} + onCancel={() => { + setPendingSecretForEnv(null); + setScreenState("secrets_config"); + }} + /> + ); + } + + if (screenState === "picking_secret") { + return ( + + config={secretPickerConfig} + onSelect={handleSecretSelect} + onCancel={() => setScreenState("secrets_config")} + initialSelected={[]} + /> + ); + } + // Show benchmark picker (single-select) if (screenState === "picking_benchmark") { return ( @@ -750,6 +1114,13 @@ export function BenchmarkJobCreateScreen({ if (formData.agentNames.length === 0) return ""; if (formData.agentNames.length === 1) return formData.agentNames[0]; return `${formData.agentNames.length} agents selected`; + case "secrets": { + const keys = Object.keys(formData.secretsMapping); + if (keys.length === 0) return ""; + if (keys.length === 1) + return `${keys[0]} → ${formData.secretsMapping[keys[0]]}`; + return `${keys.length} mappings`; + } case "model_names": return formData.modelNamesInput; case "name": diff --git a/src/screens/BenchmarkJobDetailScreen.tsx b/src/screens/BenchmarkJobDetailScreen.tsx index d71ce721..5d0ae478 100644 --- a/src/screens/BenchmarkJobDetailScreen.tsx +++ b/src/screens/BenchmarkJobDetailScreen.tsx @@ -633,15 +633,24 @@ export function BenchmarkJobDetailScreen({ // Extract agent configs - both full configs and legacy fields if (resource.job_spec?.agent_configs) { - const agentConfigs = resource.job_spec.agent_configs.map((a: any) => ({ - agentId: a.agent_id, - name: a.name, - modelName: a.model_name, - timeoutSeconds: a.timeout_seconds, - kwargs: a.kwargs, - environmentVariables: a.agent_environment?.environment_variables, - secrets: a.agent_environment?.secrets, - })); + const agentConfigs = resource.job_spec.agent_configs.map((a: any) => { + const env = a.agent_environment; + const secrets = + env?.secrets ?? + env?.secret_names ?? + (typeof env?.secret_refs === "object" && env.secret_refs + ? 
env.secret_refs + : undefined); + return { + agentId: a.agent_id, + name: a.name, + modelName: a.model_name, + timeoutSeconds: a.timeout_seconds, + kwargs: a.kwargs, + environmentVariables: env?.environment_variables, + secrets, + }; + }); cloneParams.cloneAgentConfigs = JSON.stringify(agentConfigs); // Also extract legacy fields for form initialization diff --git a/src/screens/BenchmarkJobListScreen.tsx b/src/screens/BenchmarkJobListScreen.tsx index 83475e0b..8bd0b564 100644 --- a/src/screens/BenchmarkJobListScreen.tsx +++ b/src/screens/BenchmarkJobListScreen.tsx @@ -318,16 +318,24 @@ export function BenchmarkJobListScreen() { // Extract agent configs - both full configs and legacy fields if (selectedJob.job_spec?.agent_configs) { const agentConfigs = selectedJob.job_spec.agent_configs.map( - (a: any) => ({ - agentId: a.agent_id, - name: a.name, - modelName: a.model_name, - timeoutSeconds: a.timeout_seconds, - kwargs: a.kwargs, - environmentVariables: - a.agent_environment?.environment_variables, - secrets: a.agent_environment?.secrets, - }), + (a: any) => { + const env = a.agent_environment; + const secrets = + env?.secrets ?? + env?.secret_names ?? + (typeof env?.secret_refs === "object" && env.secret_refs + ? 
env.secret_refs + : undefined); + return { + agentId: a.agent_id, + name: a.name, + modelName: a.model_name, + timeoutSeconds: a.timeout_seconds, + kwargs: a.kwargs, + environmentVariables: env?.environment_variables, + secrets, + }; + }, ); cloneParams.cloneAgentConfigs = JSON.stringify(agentConfigs); @@ -396,15 +404,24 @@ export function BenchmarkJobListScreen() { // Extract agent configs - both full configs and legacy fields if (selectedJob.job_spec?.agent_configs) { const agentConfigs = selectedJob.job_spec.agent_configs.map( - (a: any) => ({ - agentId: a.agent_id, - name: a.name, - modelName: a.model_name, - timeoutSeconds: a.timeout_seconds, - kwargs: a.kwargs, - environmentVariables: a.agent_environment?.environment_variables, - secrets: a.agent_environment?.secrets, - }), + (a: any) => { + const env = a.agent_environment; + const secrets = + env?.secrets ?? + env?.secret_names ?? + (typeof env?.secret_refs === "object" && env.secret_refs + ? env.secret_refs + : undefined); + return { + agentId: a.agent_id, + name: a.name, + modelName: a.model_name, + timeoutSeconds: a.timeout_seconds, + kwargs: a.kwargs, + environmentVariables: env?.environment_variables, + secrets, + }; + }, ); cloneParams.cloneAgentConfigs = JSON.stringify(agentConfigs); @@ -504,15 +521,24 @@ export function BenchmarkJobListScreen() { // Extract agent configs - both full configs and legacy fields if (selectedJob.job_spec?.agent_configs) { const agentConfigs = selectedJob.job_spec.agent_configs.map( - (a: any) => ({ - agentId: a.agent_id, - name: a.name, - modelName: a.model_name, - timeoutSeconds: a.timeout_seconds, - kwargs: a.kwargs, - environmentVariables: a.agent_environment?.environment_variables, - secrets: a.agent_environment?.secrets, - }), + (a: any) => { + const env = a.agent_environment; + const secrets = + env?.secrets ?? + env?.secret_names ?? + (typeof env?.secret_refs === "object" && env.secret_refs + ? 
env.secret_refs + : undefined); + return { + agentId: a.agent_id, + name: a.name, + modelName: a.model_name, + timeoutSeconds: a.timeout_seconds, + kwargs: a.kwargs, + environmentVariables: env?.environment_variables, + secrets, + }; + }, ); cloneParams.cloneAgentConfigs = JSON.stringify(agentConfigs); diff --git a/src/screens/BenchmarkListScreen.tsx b/src/screens/BenchmarkListScreen.tsx index fe953b30..c37ee700 100644 --- a/src/screens/BenchmarkListScreen.tsx +++ b/src/screens/BenchmarkListScreen.tsx @@ -24,7 +24,10 @@ import { useViewportHeight } from "../hooks/useViewportHeight.js"; import { useExitOnCtrlC } from "../hooks/useExitOnCtrlC.js"; import { useCursorPagination } from "../hooks/useCursorPagination.js"; import { useListSearch } from "../hooks/useListSearch.js"; -import { listBenchmarks } from "../services/benchmarkService.js"; +import { + listBenchmarks, + listPublicBenchmarks, +} from "../services/benchmarkService.js"; import type { Benchmark } from "../store/benchmarkStore.js"; export function BenchmarkListScreen() { @@ -33,6 +36,7 @@ export function BenchmarkListScreen() { const [selectedIndex, setSelectedIndex] = React.useState(0); const [showPopup, setShowPopup] = React.useState(false); const [selectedOperation, setSelectedOperation] = React.useState(0); + const [showPublic, setShowPublic] = React.useState(false); // Search state const search = useListSearch({ @@ -61,7 +65,8 @@ export function BenchmarkListScreen() { // Fetch function for pagination hook const fetchPage = React.useCallback( async (params: { limit: number; startingAt?: string }) => { - const result = await listBenchmarks({ + const listFn = showPublic ? 
listPublicBenchmarks : listBenchmarks; + const result = await listFn({ limit: params.limit, startingAfter: params.startingAt, search: search.submittedSearchQuery || undefined, @@ -73,7 +78,7 @@ export function BenchmarkListScreen() { totalCount: result.totalCount, }; }, - [search.submittedSearchQuery], + [showPublic, search.submittedSearchQuery], ); // Use the shared pagination hook @@ -94,7 +99,7 @@ export function BenchmarkListScreen() { getItemId: (benchmark: Benchmark) => benchmark.id, pollInterval: 5000, pollingEnabled: !showPopup && !search.searchMode, - deps: [PAGE_SIZE, search.submittedSearchQuery], + deps: [PAGE_SIZE, search.submittedSearchQuery, showPublic], }); // Operations for benchmarks @@ -271,6 +276,9 @@ export function BenchmarkListScreen() { }); } else if (input === "/") { search.enterSearchMode(); + } else if (input === "t") { + setShowPublic((prev) => !prev); + setSelectedIndex(0); } else if (key.escape) { if (search.handleEscape()) { return; @@ -339,11 +347,11 @@ export function BenchmarkListScreen() { data={benchmarks} keyExtractor={(benchmark: Benchmark) => benchmark.id} selectedIndex={selectedIndex} - title={`benchmarks[${totalCount}]`} + title={`benchmarks[${totalCount}] ${showPublic ? "(public)" : "(private)"}`} columns={columns} emptyState={ - {figures.info} No benchmarks found + {figures.info} No {showPublic ? "public " : ""}benchmarks found } /> @@ -359,6 +367,10 @@ export function BenchmarkListScreen() { {" "} total + + {" "} + • {showPublic ? "Public" : "Private"} + {totalPages > 1 && ( <> @@ -421,6 +433,7 @@ export function BenchmarkListScreen() { { key: "Enter", label: "Details" }, { key: "c", label: "Create Job" }, { key: "a", label: "Actions" }, + { key: "t", label: showPublic ? 
"Private" : "Public" }, { key: "/", label: "Search" }, { key: "Esc", label: "Back" }, ]} diff --git a/src/services/benchmarkService.ts b/src/services/benchmarkService.ts index e6373464..7abab590 100644 --- a/src/services/benchmarkService.ts +++ b/src/services/benchmarkService.ts @@ -214,7 +214,7 @@ export async function listPublicBenchmarks( } /** - * Create/start a benchmark run with selected benchmarks + * Create/start a benchmark run with selected benchmarks (POST /v1/benchmark_runs) */ export async function createBenchmarkRun( benchmarkIds: string[], @@ -241,4 +241,4 @@ export async function createBenchmarkRun( // Use type assertion since the API client types may not be fully defined // eslint-disable-next-line @typescript-eslint/no-explicit-any return (client.benchmarkRuns as any).create(createParams); -} + } From e5b790408877a813465234cc915a4df9ca57cb18 Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Fri, 6 Mar 2026 16:09:26 -0800 Subject: [PATCH 2/5] fmt --- src/screens/BenchmarkJobCreateScreen.tsx | 16 ++++++++++++---- src/screens/BenchmarkListScreen.tsx | 5 ++++- src/services/benchmarkService.ts | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/screens/BenchmarkJobCreateScreen.tsx b/src/screens/BenchmarkJobCreateScreen.tsx index c4138c52..c1af08be 100644 --- a/src/screens/BenchmarkJobCreateScreen.tsx +++ b/src/screens/BenchmarkJobCreateScreen.tsx @@ -177,13 +177,17 @@ function SecretsConfigView({ {idx === mappingEntries.length ? figures.pointer : " "} - + + Add secret @@ -191,7 +195,9 @@ function SecretsConfigView({ @@ -384,7 +390,9 @@ export function BenchmarkJobCreateScreen({ // Merge secrets from all agent configs into one mapping (clone prefill) const allSecrets = arr .map((a) => a.secrets ?? 
a.secret_names) - .filter((s): s is Record => !!s && typeof s === "object"); + .filter( + (s): s is Record => !!s && typeof s === "object", + ); if (allSecrets.length > 0) { secretsMapping = Object.assign({}, ...allSecrets); } diff --git a/src/screens/BenchmarkListScreen.tsx b/src/screens/BenchmarkListScreen.tsx index c37ee700..3d6fe786 100644 --- a/src/screens/BenchmarkListScreen.tsx +++ b/src/screens/BenchmarkListScreen.tsx @@ -367,7 +367,10 @@ export function BenchmarkListScreen() { {" "} total - + {" "} • {showPublic ? "Public" : "Private"} diff --git a/src/services/benchmarkService.ts b/src/services/benchmarkService.ts index 7abab590..d1aabb86 100644 --- a/src/services/benchmarkService.ts +++ b/src/services/benchmarkService.ts @@ -241,4 +241,4 @@ export async function createBenchmarkRun( // Use type assertion since the API client types may not be fully defined // eslint-disable-next-line @typescript-eslint/no-explicit-any return (client.benchmarkRuns as any).create(createParams); - } +} From a1d953a3fefab6a254d8be7add724bba85e1984c Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Fri, 6 Mar 2026 16:14:51 -0800 Subject: [PATCH 3/5] "Custom" benchmarks, reduce surface area of changes --- misc/config.yml | 3 ++- package.json | 1 + src/screens/BenchmarkListScreen.tsx | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/misc/config.yml b/misc/config.yml index f20e7d65..f3a417bc 100644 --- a/misc/config.yml +++ b/misc/config.yml @@ -9,7 +9,8 @@ command: rli cwd: ~ # Export additional ENV variables -env: {} +env: + recording: true # Explicitly set the number of columns # or use `auto` to take the current diff --git a/package.json b/package.json index 76abd48e..82b3e67d 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "build:mcp": "pnpm run build && node scripts/build-mcp.js", "dev": "tsc --watch", "start": "node dist/cli.js", + "start:debug": "node dist/cli.js 2> debug.log", "prepublishOnly": "pnpm run build", "version:patch": "pnpm 
version patch", "version:minor": "pnpm version minor", diff --git a/src/screens/BenchmarkListScreen.tsx b/src/screens/BenchmarkListScreen.tsx index 3d6fe786..0c6d35c4 100644 --- a/src/screens/BenchmarkListScreen.tsx +++ b/src/screens/BenchmarkListScreen.tsx @@ -372,7 +372,7 @@ export function BenchmarkListScreen() { dimColor={!showPublic} > {" "} - • {showPublic ? "Public" : "Private"} + • {showPublic ? "Public" : "Custom"} {totalPages > 1 && ( <> @@ -436,7 +436,7 @@ export function BenchmarkListScreen() { { key: "Enter", label: "Details" }, { key: "c", label: "Create Job" }, { key: "a", label: "Actions" }, - { key: "t", label: showPublic ? "Private" : "Public" }, + { key: "t", label: showPublic ? "Custom" : "Public" }, { key: "/", label: "Search" }, { key: "Esc", label: "Back" }, ]} From 8b46022bc28cbd6078e70a8a9d26e07983b9353b Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Wed, 15 Apr 2026 14:42:25 -0700 Subject: [PATCH 4/5] show benchmarks interactively again --- src/components/MainMenu.tsx | 4 ---- src/screens/BenchmarkListScreen.tsx | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/components/MainMenu.tsx b/src/components/MainMenu.tsx index 683b3660..6e456835 100644 --- a/src/components/MainMenu.tsx +++ b/src/components/MainMenu.tsx @@ -25,7 +25,6 @@ interface MenuItem { } const allMenuItems: MenuItem[] = [ - /** { key: "benchmarks", label: "Benchmarks", @@ -33,7 +32,6 @@ const allMenuItems: MenuItem[] = [ icon: "▷", color: colors.success, }, - */ { key: "devboxes", label: "Devboxes", @@ -207,10 +205,8 @@ export const MainMenu = ({ onSelect }: MainMenuProps) => { selectByKey("snapshots"); } else if (input === "o") { selectByKey("objects"); - /** } else if (input === "e") { selectByKey("benchmarks"); - */ } else if (input === "n") { selectByKey("settings"); } else if (input >= "1" && input <= "9") { diff --git a/src/screens/BenchmarkListScreen.tsx b/src/screens/BenchmarkListScreen.tsx index c4144268..bd25cb64 100644 --- 
a/src/screens/BenchmarkListScreen.tsx +++ b/src/screens/BenchmarkListScreen.tsx @@ -36,7 +36,7 @@ export function BenchmarkListScreen() { const [selectedIndex, setSelectedIndex] = React.useState(0); const [showPopup, setShowPopup] = React.useState(false); const [selectedOperation, setSelectedOperation] = React.useState(0); - const [showPublic, setShowPublic] = React.useState(false); + const [showPublic, setShowPublic] = React.useState(true); // Search state const search = useListSearch({ @@ -347,7 +347,7 @@ export function BenchmarkListScreen() { data={benchmarks} keyExtractor={(benchmark: Benchmark) => benchmark.id} selectedIndex={selectedIndex} - title={`benchmarks[${totalCount}] ${showPublic ? "(public)" : "(private)"}`} + title={`benchmarks[${totalCount}] ${showPublic ? "(public)" : "(custom)"}`} columns={columns} emptyState={ From 947a44885da2e0cf31a97018e37315e8995c8e20 Mon Sep 17 00:00:00 2001 From: Rob von Behren Date: Wed, 15 Apr 2026 16:17:48 -0700 Subject: [PATCH 5/5] consolidate boilerplate into helper function --- src/screens/BenchmarkJobDetailScreen.tsx | 82 +------- src/screens/BenchmarkJobListScreen.tsx | 237 +---------------------- src/services/benchmarkJobService.ts | 78 ++++++++ 3 files changed, 87 insertions(+), 310 deletions(-) diff --git a/src/screens/BenchmarkJobDetailScreen.tsx b/src/screens/BenchmarkJobDetailScreen.tsx index cb1ecea1..e9fb2ed6 100644 --- a/src/screens/BenchmarkJobDetailScreen.tsx +++ b/src/screens/BenchmarkJobDetailScreen.tsx @@ -16,7 +16,10 @@ import { type DetailSection, type ResourceOperation, } from "../components/ResourceDetailPage.js"; -import { getBenchmarkJob } from "../services/benchmarkJobService.js"; +import { + getBenchmarkJob, + buildCloneParams, +} from "../services/benchmarkJobService.js"; import { getBenchmarkRun } from "../services/benchmarkService.js"; import { useResourceDetail } from "../hooks/useResourceDetail.js"; import { SpinnerComponent } from "../components/Spinner.js"; @@ -610,82 +613,7 @@ export 
function BenchmarkJobDetailScreen({ }); } } else if (operation === "clone-job") { - // Pass job data for cloning - const cloneParams: any = { - cloneFromJobId: resource.id, - cloneJobName: resource.name, - }; - - // Determine source type and extract IDs - if (resource.job_spec) { - const spec = resource.job_spec as any; - - // Check if it's a scenarios spec (has scenario_ids array) - if (spec.scenario_ids && Array.isArray(spec.scenario_ids)) { - cloneParams.cloneSourceType = "scenarios"; - cloneParams.initialScenarioIds = spec.scenario_ids.join(","); - } - // Check if it's a benchmark spec (has benchmark_id) - else if (spec.benchmark_id) { - cloneParams.cloneSourceType = "benchmark"; - cloneParams.initialBenchmarkIds = spec.benchmark_id; - } - // Fallback: check job_source - else if (resource.job_source) { - const source = resource.job_source as any; - if (source.scenario_ids && Array.isArray(source.scenario_ids)) { - cloneParams.cloneSourceType = "scenarios"; - cloneParams.initialScenarioIds = source.scenario_ids.join(","); - } else if (source.benchmark_id) { - cloneParams.cloneSourceType = "benchmark"; - cloneParams.initialBenchmarkIds = source.benchmark_id; - } - } - } - - // Extract agent configs - both full configs and legacy fields - if (resource.job_spec?.agent_configs) { - const agentConfigs = resource.job_spec.agent_configs.map((a: any) => { - const env = a.agent_environment; - const secrets = - env?.secrets ?? - env?.secret_names ?? - (typeof env?.secret_refs === "object" && env.secret_refs - ? 
env.secret_refs - : undefined); - return { - agentId: a.agent_id, - name: a.name, - modelName: a.model_name, - timeoutSeconds: a.timeout_seconds, - kwargs: a.kwargs, - environmentVariables: env?.environment_variables, - secrets, - }; - }); - cloneParams.cloneAgentConfigs = JSON.stringify(agentConfigs); - - // Also extract legacy fields for form initialization - cloneParams.cloneAgentIds = resource.job_spec.agent_configs - .map((a: any) => a.agent_id) - .join(","); - cloneParams.cloneAgentNames = resource.job_spec.agent_configs - .map((a: any) => a.name) - .join(","); - } - - // Extract orchestrator config - if (resource.job_spec?.orchestrator_config) { - const orch = resource.job_spec.orchestrator_config; - cloneParams.cloneOrchestratorConfig = JSON.stringify({ - nAttempts: orch.n_attempts, - nConcurrentTrials: orch.n_concurrent_trials, - quiet: orch.quiet, - timeoutMultiplier: orch.timeout_multiplier, - }); - } - - navigate("benchmark-job-create", cloneParams); + navigate("benchmark-job-create", buildCloneParams(resource)); } }; diff --git a/src/screens/BenchmarkJobListScreen.tsx b/src/screens/BenchmarkJobListScreen.tsx index b0222eed..82ad69ff 100644 --- a/src/screens/BenchmarkJobListScreen.tsx +++ b/src/screens/BenchmarkJobListScreen.tsx @@ -26,6 +26,7 @@ import { useCursorPagination } from "../hooks/useCursorPagination.js"; import { useListSearch } from "../hooks/useListSearch.js"; import { listBenchmarkJobs, + buildCloneParams, type BenchmarkJob, } from "../services/benchmarkJobService.js"; @@ -282,84 +283,7 @@ export function BenchmarkJobListScreen() { benchmarkJobId: selectedJob.id, }); } else if (operationKey === "clone_job" && selectedJob) { - // Pass job data for cloning - const cloneParams: any = { - cloneFromJobId: selectedJob.id, - cloneJobName: selectedJob.name, - }; - - // Determine source type and extract IDs - if (selectedJob.job_spec) { - const spec = selectedJob.job_spec as any; - - // Check if it's a scenarios spec (has scenario_ids array) - if 
(spec.scenario_ids && Array.isArray(spec.scenario_ids)) { - cloneParams.cloneSourceType = "scenarios"; - cloneParams.initialScenarioIds = spec.scenario_ids.join(","); - } - // Check if it's a benchmark spec (has benchmark_id) - else if (spec.benchmark_id) { - cloneParams.cloneSourceType = "benchmark"; - cloneParams.initialBenchmarkIds = spec.benchmark_id; - } - // Fallback: check job_source - else if (selectedJob.job_source) { - const source = selectedJob.job_source as any; - if (source.scenario_ids && Array.isArray(source.scenario_ids)) { - cloneParams.cloneSourceType = "scenarios"; - cloneParams.initialScenarioIds = source.scenario_ids.join(","); - } else if (source.benchmark_id) { - cloneParams.cloneSourceType = "benchmark"; - cloneParams.initialBenchmarkIds = source.benchmark_id; - } - } - } - - // Extract agent configs - both full configs and legacy fields - if (selectedJob.job_spec?.agent_configs) { - const agentConfigs = selectedJob.job_spec.agent_configs.map( - (a: any) => { - const env = a.agent_environment; - const secrets = - env?.secrets ?? - env?.secret_names ?? - (typeof env?.secret_refs === "object" && env.secret_refs - ? 
env.secret_refs - : undefined); - return { - agentId: a.agent_id, - name: a.name, - modelName: a.model_name, - timeoutSeconds: a.timeout_seconds, - kwargs: a.kwargs, - environmentVariables: env?.environment_variables, - secrets, - }; - }, - ); - cloneParams.cloneAgentConfigs = JSON.stringify(agentConfigs); - - // Also extract legacy fields for form initialization - cloneParams.cloneAgentIds = selectedJob.job_spec.agent_configs - .map((a: any) => a.agent_id) - .join(","); - cloneParams.cloneAgentNames = selectedJob.job_spec.agent_configs - .map((a: any) => a.name) - .join(","); - } - - // Extract orchestrator config - if (selectedJob.job_spec?.orchestrator_config) { - const orch = selectedJob.job_spec.orchestrator_config; - cloneParams.cloneOrchestratorConfig = JSON.stringify({ - nAttempts: orch.n_attempts, - nConcurrentTrials: orch.n_concurrent_trials, - quiet: orch.quiet, - timeoutMultiplier: orch.timeout_multiplier, - }); - } - - navigate("benchmark-job-create", cloneParams); + navigate("benchmark-job-create", buildCloneParams(selectedJob)); } } else if (input === "v" && selectedJob) { setShowPopup(false); @@ -368,84 +292,7 @@ export function BenchmarkJobListScreen() { }); } else if (input === "n" && selectedJob) { setShowPopup(false); - // Clone the selected job - const cloneParams: any = { - cloneFromJobId: selectedJob.id, - cloneJobName: selectedJob.name, - }; - - // Determine source type and extract IDs - if (selectedJob.job_spec) { - const spec = selectedJob.job_spec as any; - - // Check if it's a scenarios spec (has scenario_ids array) - if (spec.scenario_ids && Array.isArray(spec.scenario_ids)) { - cloneParams.cloneSourceType = "scenarios"; - cloneParams.initialScenarioIds = spec.scenario_ids.join(","); - } - // Check if it's a benchmark spec (has benchmark_id) - else if (spec.benchmark_id) { - cloneParams.cloneSourceType = "benchmark"; - cloneParams.initialBenchmarkIds = spec.benchmark_id; - } - // Fallback: check job_source - else if 
(selectedJob.job_source) { - const source = selectedJob.job_source as any; - if (source.scenario_ids && Array.isArray(source.scenario_ids)) { - cloneParams.cloneSourceType = "scenarios"; - cloneParams.initialScenarioIds = source.scenario_ids.join(","); - } else if (source.benchmark_id) { - cloneParams.cloneSourceType = "benchmark"; - cloneParams.initialBenchmarkIds = source.benchmark_id; - } - } - } - - // Extract agent configs - both full configs and legacy fields - if (selectedJob.job_spec?.agent_configs) { - const agentConfigs = selectedJob.job_spec.agent_configs.map( - (a: any) => { - const env = a.agent_environment; - const secrets = - env?.secrets ?? - env?.secret_names ?? - (typeof env?.secret_refs === "object" && env.secret_refs - ? env.secret_refs - : undefined); - return { - agentId: a.agent_id, - name: a.name, - modelName: a.model_name, - timeoutSeconds: a.timeout_seconds, - kwargs: a.kwargs, - environmentVariables: env?.environment_variables, - secrets, - }; - }, - ); - cloneParams.cloneAgentConfigs = JSON.stringify(agentConfigs); - - // Also extract legacy fields for form initialization - cloneParams.cloneAgentIds = selectedJob.job_spec.agent_configs - .map((a: any) => a.agent_id) - .join(","); - cloneParams.cloneAgentNames = selectedJob.job_spec.agent_configs - .map((a: any) => a.name) - .join(","); - } - - // Extract orchestrator config - if (selectedJob.job_spec?.orchestrator_config) { - const orch = selectedJob.job_spec.orchestrator_config; - cloneParams.cloneOrchestratorConfig = JSON.stringify({ - nAttempts: orch.n_attempts, - nConcurrentTrials: orch.n_concurrent_trials, - quiet: orch.quiet, - timeoutMultiplier: orch.timeout_multiplier, - }); - } - - navigate("benchmark-job-create", cloneParams); + navigate("benchmark-job-create", buildCloneParams(selectedJob)); } else if (key.escape || input === "q") { setShowPopup(false); setSelectedOperation(0); @@ -486,83 +333,7 @@ export function BenchmarkJobListScreen() { } else if (input === "3") { // Quick 
shortcut to clone the selected job, or create a new job if none selected if (selectedJob) { - const cloneParams: any = { - cloneFromJobId: selectedJob.id, - cloneJobName: selectedJob.name, - }; - - // Determine source type and extract IDs - if (selectedJob.job_spec) { - const spec = selectedJob.job_spec as any; - - // Check if it's a scenarios spec (has scenario_ids array) - if (spec.scenario_ids && Array.isArray(spec.scenario_ids)) { - cloneParams.cloneSourceType = "scenarios"; - cloneParams.initialScenarioIds = spec.scenario_ids.join(","); - } - // Check if it's a benchmark spec (has benchmark_id) - else if (spec.benchmark_id) { - cloneParams.cloneSourceType = "benchmark"; - cloneParams.initialBenchmarkIds = spec.benchmark_id; - } - // Fallback: check job_source - else if (selectedJob.job_source) { - const source = selectedJob.job_source as any; - if (source.scenario_ids && Array.isArray(source.scenario_ids)) { - cloneParams.cloneSourceType = "scenarios"; - cloneParams.initialScenarioIds = source.scenario_ids.join(","); - } else if (source.benchmark_id) { - cloneParams.cloneSourceType = "benchmark"; - cloneParams.initialBenchmarkIds = source.benchmark_id; - } - } - } - - // Extract agent configs - both full configs and legacy fields - if (selectedJob.job_spec?.agent_configs) { - const agentConfigs = selectedJob.job_spec.agent_configs.map( - (a: any) => { - const env = a.agent_environment; - const secrets = - env?.secrets ?? - env?.secret_names ?? - (typeof env?.secret_refs === "object" && env.secret_refs - ? 
env.secret_refs - : undefined); - return { - agentId: a.agent_id, - name: a.name, - modelName: a.model_name, - timeoutSeconds: a.timeout_seconds, - kwargs: a.kwargs, - environmentVariables: env?.environment_variables, - secrets, - }; - }, - ); - cloneParams.cloneAgentConfigs = JSON.stringify(agentConfigs); - - // Also extract legacy fields for form initialization - cloneParams.cloneAgentIds = selectedJob.job_spec.agent_configs - .map((a: any) => a.agent_id) - .join(","); - cloneParams.cloneAgentNames = selectedJob.job_spec.agent_configs - .map((a: any) => a.name) - .join(","); - } - - // Extract orchestrator config - if (selectedJob.job_spec?.orchestrator_config) { - const orch = selectedJob.job_spec.orchestrator_config; - cloneParams.cloneOrchestratorConfig = JSON.stringify({ - nAttempts: orch.n_attempts, - nConcurrentTrials: orch.n_concurrent_trials, - quiet: orch.quiet, - timeoutMultiplier: orch.timeout_multiplier, - }); - } - - navigate("benchmark-job-create", cloneParams); + navigate("benchmark-job-create", buildCloneParams(selectedJob)); } else { navigate("benchmark-job-create"); } diff --git a/src/services/benchmarkJobService.ts b/src/services/benchmarkJobService.ts index 9ad150b6..c766cb1f 100644 --- a/src/services/benchmarkJobService.ts +++ b/src/services/benchmarkJobService.ts @@ -19,6 +19,84 @@ export type BenchmarkRun = BenchmarkRunView; export type ScenarioRun = ScenarioRunView; export type { BenchmarkJobCreateParams }; +/** + * Extract clone parameters from a benchmark job for navigating to the create screen. + * Handles source type detection, agent config mapping (with secrets format variants), + * and orchestrator config extraction. + */ +export function buildCloneParams(job: BenchmarkJob): Record { + const params: Record = { + cloneFromJobId: job.id, + cloneJobName: job.name ?? 
"", + }; + + // Determine source type and extract IDs + if (job.job_spec) { + const spec = job.job_spec as any; + + if (spec.scenario_ids && Array.isArray(spec.scenario_ids)) { + params.cloneSourceType = "scenarios"; + params.initialScenarioIds = spec.scenario_ids.join(","); + } else if (spec.benchmark_id) { + params.cloneSourceType = "benchmark"; + params.initialBenchmarkIds = spec.benchmark_id; + } else if (job.job_source) { + const source = job.job_source as any; + if (source.scenario_ids && Array.isArray(source.scenario_ids)) { + params.cloneSourceType = "scenarios"; + params.initialScenarioIds = source.scenario_ids.join(","); + } else if (source.benchmark_id) { + params.cloneSourceType = "benchmark"; + params.initialBenchmarkIds = source.benchmark_id; + } + } + } + + // Extract agent configs + if (job.job_spec?.agent_configs) { + const agentConfigs = job.job_spec.agent_configs.map((a: any) => { + const env = a.agent_environment; + const secrets = + env?.secrets ?? + env?.secret_names ?? + (typeof env?.secret_refs === "object" && env.secret_refs + ? 
env.secret_refs + : undefined); + return { + agentId: a.agent_id, + name: a.name, + modelName: a.model_name, + timeoutSeconds: a.timeout_seconds, + kwargs: a.kwargs, + environmentVariables: env?.environment_variables, + secrets, + }; + }); + params.cloneAgentConfigs = JSON.stringify(agentConfigs); + + // Also extract legacy fields for form initialization + params.cloneAgentIds = job.job_spec.agent_configs + .map((a: any) => a.agent_id) + .join(","); + params.cloneAgentNames = job.job_spec.agent_configs + .map((a: any) => a.name) + .join(","); + } + + // Extract orchestrator config + if (job.job_spec?.orchestrator_config) { + const orch = job.job_spec.orchestrator_config; + params.cloneOrchestratorConfig = JSON.stringify({ + nAttempts: orch.n_attempts, + nConcurrentTrials: orch.n_concurrent_trials, + quiet: orch.quiet, + timeoutMultiplier: orch.timeout_multiplier, + }); + } + + return params; +} + export interface ListBenchmarkJobsOptions { limit?: number; startingAfter?: string;