From d928b80189fcbbca133b821c4775249b25cf0b20 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Tue, 25 Nov 2025 16:30:48 -0700 Subject: [PATCH 1/4] feat(web-evals): enhance dashboard with dynamic tool columns and UX improvements - Add aggregate statistics panel on run details page - Add dynamic tool usage columns sorted by total usage - Add API config selector for multi-config imports - Add language toggle buttons for exercise selection - Persist concurrency/timeout settings to localStorage - Make table rows clickable for faster navigation - Add View Settings option in dropdown menu - Support controlled mode for MultiSelect component - Filter deprecated models from Roo Code Cloud list --- apps/web-evals/src/app/runs/[id]/run.tsx | 173 +++++++++++- apps/web-evals/src/app/runs/new/new-run.tsx | 258 ++++++++++++++++-- apps/web-evals/src/components/home/run.tsx | 81 ++++-- apps/web-evals/src/components/home/runs.tsx | 67 ++++- .../src/components/ui/multi-select.tsx | 26 +- .../src/hooks/use-roo-code-cloud-models.ts | 2 +- 6 files changed, 557 insertions(+), 50 deletions(-) diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index b6c5290b135..7bb2cd1e115 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -5,15 +5,36 @@ import { LoaderCircle } from "lucide-react" import type { Run, TaskMetrics as _TaskMetrics } from "@roo-code/evals" -import { formatCurrency, formatDuration, formatTokens } from "@/lib/formatters" +import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters" import { useRunStatus } from "@/hooks/use-run-status" -import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui" +import { + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, + Tooltip, + TooltipContent, + TooltipTrigger, +} from "@/components/ui" import { TaskStatus } from "./task-status" import { RunStatus } from "./run-status" type TaskMetrics = Pick<_TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost"> +type ToolUsageEntry = { attempts: number; failures: number } +type ToolUsage = Record + +// Generate abbreviation from tool name (e.g., "read_file" -> "RF", "list_code_definition_names" -> "LCDN") +function getToolAbbreviation(toolName: string): string { + return toolName + .split("_") + .map((word) => word[0]?.toUpperCase() ?? "") + .join("") +} + export function Run({ run }: { run: Run }) { const runStatus = useRunStatus(run) const { tasks, tokenUsage, usageUpdatedAt } = runStatus @@ -41,16 +62,162 @@ export function Run({ run }: { run: Run }) { // eslint-disable-next-line react-hooks/exhaustive-deps }, [tasks, tokenUsage, usageUpdatedAt]) + // Compute aggregate stats + const stats = useMemo(() => { + if (!tasks) return null + + const passed = tasks.filter((t) => t.passed === true).length + const failed = tasks.filter((t) => t.passed === false).length + const running = tasks.filter((t) => t.startedAt && !t.finishedAt).length + const pending = tasks.filter((t) => !t.startedAt && !t.finishedAt).length + const total = tasks.length + const completed = passed + failed + + let totalTokensIn = 0 + let totalTokensOut = 0 + let totalCost = 0 + let totalDuration = 0 + + // Aggregate tool usage from completed tasks + const toolUsage: ToolUsage = {} + + for (const task of tasks) { + const metrics = taskMetrics[task.id] + if (metrics) { + totalTokensIn += metrics.tokensIn + totalTokensOut += metrics.tokensOut + totalCost += metrics.cost + totalDuration += metrics.duration + } + + // Aggregate tool usage from finished tasks with taskMetrics + if (task.finishedAt && task.taskMetrics?.toolUsage) { + for (const [key, usage] of Object.entries(task.taskMetrics.toolUsage)) { + const tool = key as keyof ToolUsage + if (!toolUsage[tool]) { + toolUsage[tool] = { attempts: 0, failures: 0 } + } + toolUsage[tool].attempts += usage.attempts + toolUsage[tool].failures += usage.failures + } + } + } + + return { + passed, + failed, + running, + pending, + total, + completed, + passRate: completed > 0 ? ((passed / completed) * 100).toFixed(1) : null, + totalTokensIn, + totalTokensOut, + totalCost, + totalDuration, + toolUsage, + } + }, [tasks, taskMetrics]) + return ( <>
-
+
{run.model}
{run.description &&
{run.description}
}
{!run.taskMetricsId && }
+ + {stats && ( +
+ {/* Main Stats Row */} +
+ {/* Passed/Failed */} +
+
+ {stats.passed} + / + {stats.failed} + {stats.running > 0 && ( + ({stats.running}) + )} +
+
Passed / Failed
+
+ + {/* Pass Rate */} +
+
{stats.passRate ? `${stats.passRate}%` : "-"}
+
Pass Rate
+
+ + {/* Tokens */} +
+
+ {formatTokens(stats.totalTokensIn)} + / + {formatTokens(stats.totalTokensOut)} +
+
Tokens In / Out
+
+ + {/* Cost */} +
+
{formatCurrency(stats.totalCost)}
+
Cost
+
+ + {/* Duration */} +
+
+ {stats.totalDuration > 0 ? formatDuration(stats.totalDuration) : "-"} +
+
Duration
+
+ + {/* Tool Usage - Inline */} + {Object.keys(stats.toolUsage).length > 0 && ( +
+ {Object.entries(stats.toolUsage) + .sort(([, a], [, b]) => b.attempts - a.attempts) + .map(([toolName, usage]) => { + const abbr = getToolAbbreviation(toolName) + const successRate = + usage.attempts > 0 + ? ((usage.attempts - usage.failures) / usage.attempts) * 100 + : 100 + const rateColor = + successRate === 100 + ? "text-green-500" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" + return ( + + +
+ + {abbr} + + + {usage.attempts} + + + {formatToolUsageSuccessRate(usage)} + +
+
+ {toolName} +
+ ) + })} +
+ )} +
+
+ )} {!tasks ? ( ) : ( diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx index 2d424e35f72..3782f29a362 100644 --- a/apps/web-evals/src/app/runs/new/new-run.tsx +++ b/apps/web-evals/src/app/runs/new/new-run.tsx @@ -1,6 +1,6 @@ "use client" -import { useCallback, useState } from "react" +import { useCallback, useEffect, useMemo, useState } from "react" import { useRouter } from "next/navigation" import { z } from "zod" import { useQuery } from "@tanstack/react-query" @@ -9,7 +9,14 @@ import { zodResolver } from "@hookform/resolvers/zod" import { toast } from "sonner" import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal } from "lucide-react" -import { globalSettingsSchema, providerSettingsSchema, EVALS_SETTINGS, getModelId } from "@roo-code/types" +import { + globalSettingsSchema, + providerSettingsSchema, + EVALS_SETTINGS, + getModelId, + type ProviderSettings, + type GlobalSettings, +} from "@roo-code/types" import { createRun } from "@/actions/runs" import { getExercises } from "@/actions/exercises" @@ -59,6 +66,12 @@ import { import { SettingsDiff } from "./settings-diff" +type ImportedSettings = { + apiConfigs: Record + globalSettings: GlobalSettings + currentApiConfigName: string +} + export function NewRun() { const router = useRouter() @@ -66,6 +79,11 @@ export function NewRun() { const [modelPopoverOpen, setModelPopoverOpen] = useState(false) const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true) + // State for imported settings with config selection + const [importedSettings, setImportedSettings] = useState(null) + const [selectedConfigName, setSelectedConfigName] = useState("") + const [configPopoverOpen, setConfigPopoverOpen] = useState(false) + const openRouter = useOpenRouterModels() const rooCodeCloud = useRooCodeCloudModels() const models = provider === "openrouter" ? openRouter.data : rooCodeCloud.data @@ -75,6 +93,9 @@ export function NewRun() { const exercises = useQuery({ queryKey: ["getExercises"], queryFn: () => getExercises() }) + // State for selected exercises (needed for language toggle buttons) + const [selectedExercises, setSelectedExercises] = useState([]) + const form = useForm({ resolver: zodResolver(createRunSchema), defaultValues: { @@ -98,6 +119,88 @@ export function NewRun() { const [model, suite, settings] = watch(["model", "suite", "settings", "concurrency"]) + // Load concurrency and timeout from localStorage on mount + useEffect(() => { + const savedConcurrency = localStorage.getItem("evals-concurrency") + if (savedConcurrency) { + const parsed = parseInt(savedConcurrency, 10) + if (!isNaN(parsed) && parsed >= CONCURRENCY_MIN && parsed <= CONCURRENCY_MAX) { + setValue("concurrency", parsed) + } + } + const savedTimeout = localStorage.getItem("evals-timeout") + if (savedTimeout) { + const parsed = parseInt(savedTimeout, 10) + if (!isNaN(parsed) && parsed >= TIMEOUT_MIN && parsed <= TIMEOUT_MAX) { + setValue("timeout", parsed) + } + } + }, [setValue]) + + // Extract unique languages from exercises + const languages = useMemo(() => { + if (!exercises.data) return [] + const langs = new Set() + for (const path of exercises.data) { + const lang = path.split("/")[0] + if (lang) langs.add(lang) + } + return Array.from(langs).sort() + }, [exercises.data]) + + // Get exercises for a specific language + const getExercisesForLanguage = useCallback( + (lang: string) => { + if (!exercises.data) return [] + return exercises.data.filter((path) => path.startsWith(`${lang}/`)) + }, + [exercises.data], + ) + + // Toggle all exercises for a language + const toggleLanguage = useCallback( + (lang: string) => { + const langExercises = getExercisesForLanguage(lang) + const allSelected = langExercises.every((ex) => selectedExercises.includes(ex)) + + let newSelected: string[] + if (allSelected) { + // Remove all exercises for this language + newSelected = selectedExercises.filter((ex) => !ex.startsWith(`${lang}/`)) + } else { + // Add all exercises for this language (avoiding duplicates) + const existing = new Set(selectedExercises) + for (const ex of langExercises) { + existing.add(ex) + } + newSelected = Array.from(existing) + } + + setSelectedExercises(newSelected) + setValue("exercises", newSelected) + }, + [getExercisesForLanguage, selectedExercises, setValue], + ) + + // Check if all exercises for a language are selected + const isLanguageSelected = useCallback( + (lang: string) => { + const langExercises = getExercisesForLanguage(lang) + return langExercises.length > 0 && langExercises.every((ex) => selectedExercises.includes(ex)) + }, + [getExercisesForLanguage, selectedExercises], + ) + + // Check if some (but not all) exercises for a language are selected + const isLanguagePartiallySelected = useCallback( + (lang: string) => { + const langExercises = getExercisesForLanguage(lang) + const selectedCount = langExercises.filter((ex) => selectedExercises.includes(ex)).length + return selectedCount > 0 && selectedCount < langExercises.length + }, + [getExercisesForLanguage, selectedExercises], + ) + const onSubmit = useCallback( async (values: CreateRun) => { try { @@ -155,8 +258,19 @@ export function NewRun() { }) .parse(JSON.parse(await file.text())) - const providerSettings = providerProfiles.apiConfigs[providerProfiles.currentApiConfigName] ?? {} + // Store all imported configs for user selection + setImportedSettings({ + apiConfigs: providerProfiles.apiConfigs, + globalSettings, + currentApiConfigName: providerProfiles.currentApiConfigName, + }) + + // Default to the current config + const defaultConfigName = providerProfiles.currentApiConfigName + setSelectedConfigName(defaultConfigName) + // Apply the default config + const providerSettings = providerProfiles.apiConfigs[defaultConfigName] ?? {} setValue("model", getModelId(providerSettings) ?? "") setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...globalSettings }) @@ -169,6 +283,22 @@ export function NewRun() { [clearErrors, setValue], ) + const onSelectConfig = useCallback( + (configName: string) => { + if (!importedSettings) { + return + } + + setSelectedConfigName(configName) + setConfigPopoverOpen(false) + + const providerSettings = importedSettings.apiConfigs[configName] ?? {} + setValue("model", getModelId(providerSettings) ?? "") + setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...importedSettings.globalSettings }) + }, + [importedSettings, setValue], + ) + return ( <> @@ -207,6 +337,63 @@ export function NewRun() { className="hidden" onChange={onImportSettings} /> + + {importedSettings && Object.keys(importedSettings.apiConfigs).length > 1 && ( +
+ + + + + + + + + + No config found. + + {Object.keys(importedSettings.apiConfigs).map( + (configName) => ( + + {configName} + {configName === + importedSettings.currentApiConfigName && ( + + (default) + + )} + + + ), + )} + + + + + +
+ )} + {settings && ( )} @@ -306,18 +493,51 @@ export function NewRun() { render={() => ( Exercises - setValue("suite", value as "full" | "partial")}> - - All - Some - - +
+ { + setValue("suite", value as "full" | "partial") + if (value === "full") { + setSelectedExercises([]) + setValue("exercises", []) + } + }}> + + All + Some + + + {suite === "partial" && languages.length > 0 && ( +
+ {languages.map((lang) => ( + + ))} +
+ )} +
{suite === "partial" && ( ({ value: path, label: path })) || []} - onValueChange={(value) => setValue("exercises", value)} + value={selectedExercises} + onValueChange={(value) => { + setSelectedExercises(value) + setValue("exercises", value) + }} placeholder="Select" variant="inverted" maxCount={4} @@ -337,11 +557,14 @@ export function NewRun() {
field.onChange(value[0])} + onValueChange={(value) => { + field.onChange(value[0]) + localStorage.setItem("evals-concurrency", String(value[0])) + }} />
{field.value}
@@ -360,11 +583,14 @@ export function NewRun() {
field.onChange(value[0])} + onValueChange={(value) => { + field.onChange(value[0]) + localStorage.setItem("evals-timeout", String(value[0])) + }} />
{field.value}
diff --git a/apps/web-evals/src/components/home/run.tsx b/apps/web-evals/src/components/home/run.tsx index c35673885c3..7734219fafc 100644 --- a/apps/web-evals/src/components/home/run.tsx +++ b/apps/web-evals/src/components/home/run.tsx @@ -1,8 +1,10 @@ import { useCallback, useState, useRef } from "react" import Link from "next/link" -import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash } from "lucide-react" +import { useRouter } from "next/navigation" +import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings } from "lucide-react" import type { Run as EvalsRun, TaskMetrics as EvalsTaskMetrics } from "@roo-code/evals" +import type { ToolName } from "@roo-code/types" import { deleteRun } from "@/actions/runs" import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters" @@ -23,15 +25,23 @@ import { AlertDialogFooter, AlertDialogHeader, AlertDialogTitle, + Dialog, + DialogContent, + DialogHeader, + DialogTitle, + ScrollArea, } from "@/components/ui" type RunProps = { run: EvalsRun taskMetrics: EvalsTaskMetrics | null + toolColumns: ToolName[] } -export function Run({ run, taskMetrics }: RunProps) { +export function Run({ run, taskMetrics, toolColumns }: RunProps) { + const router = useRouter() const [deleteRunId, setDeleteRunId] = useState() + const [showSettings, setShowSettings] = useState(false) const continueRef = useRef(null) const { isPending, copyRun, copied } = useCopyRun(run.id) @@ -48,9 +58,20 @@ export function Run({ run, taskMetrics }: RunProps) { } }, [deleteRunId]) + const handleRowClick = useCallback( + (e: React.MouseEvent) => { + // Don't navigate if clicking on the dropdown menu + if ((e.target as HTMLElement).closest("[data-dropdown-trigger]")) { + return + } + router.push(`/runs/${run.id}`) + }, + [router, run.id], + ) + return ( <> - + {run.model} {run.passed} {run.failed} @@ -61,27 +82,33 @@ export function Run({ run, taskMetrics }: RunProps) { {taskMetrics && ( -
-
{formatTokens(taskMetrics.tokensIn)}
/ -
{formatTokens(taskMetrics.tokensOut)}
-
- )} -
- - {taskMetrics?.toolUsage?.apply_diff && ( -
-
{taskMetrics.toolUsage.apply_diff.attempts}
-
/
-
{formatToolUsageSuccessRate(taskMetrics.toolUsage.apply_diff)}
+
+ {formatTokens(taskMetrics.tokensIn)}/ + {formatTokens(taskMetrics.tokensOut)}
)} + {toolColumns.map((toolName) => { + const usage = taskMetrics?.toolUsage?.[toolName] + return ( + + {usage ? ( +
+ {usage.attempts} + {formatToolUsageSuccessRate(usage)} +
+ ) : ( + - + )} +
+ ) + })} {taskMetrics && formatCurrency(taskMetrics.cost)} {taskMetrics && formatDuration(taskMetrics.duration)} - + e.stopPropagation()}> @@ -94,6 +121,14 @@ export function Run({ run, taskMetrics }: RunProps) {
+ {run.settings && ( + setShowSettings(true)}> +
+ +
View Settings
+
+
+ )} {run.taskMetricsId && ( copyRun()} disabled={isPending || copied}>
@@ -144,6 +179,18 @@ export function Run({ run, taskMetrics }: RunProps) { + + + + Run Settings + + +
+							{JSON.stringify(run.settings, null, 2)}
+						
+
+
+
) } diff --git a/apps/web-evals/src/components/home/runs.tsx b/apps/web-evals/src/components/home/runs.tsx index 8bc8739b28e..22dd3ff9376 100644 --- a/apps/web-evals/src/components/home/runs.tsx +++ b/apps/web-evals/src/components/home/runs.tsx @@ -1,18 +1,62 @@ "use client" +import { useMemo } from "react" import { useRouter } from "next/navigation" import { Rocket } from "lucide-react" import type { Run, TaskMetrics } from "@roo-code/evals" +import type { ToolName } from "@roo-code/types" -import { Button, Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui" +import { + Button, + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, + Tooltip, + TooltipContent, + TooltipTrigger, +} from "@/components/ui" import { Run as Row } from "@/components/home/run" type RunWithTaskMetrics = Run & { taskMetrics: TaskMetrics | null } +// Generate abbreviation from tool name (e.g., "read_file" -> "RF", "list_code_definition_names" -> "LCDN") +function getToolAbbreviation(toolName: string): string { + return toolName + .split("_") + .map((word) => word[0]?.toUpperCase() ?? "") + .join("") +} + export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { const router = useRouter() + // Collect all unique tool names from all runs and sort by total attempts + const toolColumns = useMemo(() => { + const toolTotals = new Map() + + for (const run of runs) { + if (run.taskMetrics?.toolUsage) { + for (const [toolName, usage] of Object.entries(run.taskMetrics.toolUsage)) { + const tool = toolName as ToolName + const current = toolTotals.get(tool) ?? 0 + toolTotals.set(tool, current + usage.attempts) + } + } + } + + // Sort by total attempts descending + return Array.from(toolTotals.entries()) + .sort((a, b) => b[1] - a[1]) + .map(([name]): ToolName => name) + }, [runs]) + + // Calculate colSpan for empty state (5 base columns + dynamic tools + 3 end columns) + const totalColumns = 5 + toolColumns.length + 3 + return ( <> @@ -21,20 +65,29 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { ModelPassedFailed - % Correct - Tokens In / Out - Diff Edits + % + Tokens + {toolColumns.map((toolName) => ( + + + {getToolAbbreviation(toolName)} + {toolName} + + + ))} CostDuration - + {runs.length ? ( - runs.map(({ taskMetrics, ...run }) => ) + runs.map(({ taskMetrics, ...run }) => ( + + )) ) : ( - + No eval runs yet.
- Model - Passed - Failed - % + handleSort("model")}> +
+ Model + +
+
+ handleSort("provider")}> +
+ Provider + +
+
+ handleSort("createdAt")}> +
+ Created + +
+
+ handleSort("passed")}> +
+ Passed + +
+
+ handleSort("failed")}> +
+ Failed + +
+
+ handleSort("percent")}> +
+ % + +
+
Tokens {toolColumns.map((toolName) => ( @@ -75,14 +196,24 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { ))} - Cost - Duration + handleSort("cost")}> +
+ Cost + +
+
+ handleSort("duration")}> +
+ Duration + +
+
- {runs.length ? ( - runs.map(({ taskMetrics, ...run }) => ( + {sortedRuns.length ? ( + sortedRuns.map(({ taskMetrics, ...run }) => ( )) ) : ( diff --git a/apps/web-evals/src/lib/formatters.ts b/apps/web-evals/src/lib/formatters.ts index 0f75b64a279..0e8a235ac2c 100644 --- a/apps/web-evals/src/lib/formatters.ts +++ b/apps/web-evals/src/lib/formatters.ts @@ -46,3 +46,13 @@ export const formatTokens = (tokens: number) => { export const formatToolUsageSuccessRate = (usage: { attempts: number; failures: number }) => usage.attempts === 0 ? "0%" : `${(((usage.attempts - usage.failures) / usage.attempts) * 100).toFixed(1)}%` + +export const formatDateTime = (date: Date) => { + return new Intl.DateTimeFormat("en-US", { + month: "short", + day: "numeric", + hour: "numeric", + minute: "2-digit", + hour12: true, + }).format(date) +}