From 57ab0752be44d058d6be860c92c8f9531b57800b Mon Sep 17 00:00:00 2001 From: David Gageot Date: Thu, 5 Feb 2026 14:49:42 +0100 Subject: [PATCH] Fix tool calls score rendering Signed-off-by: David Gageot --- pkg/evaluation/scoring.go | 24 +++++++++++++++--------- pkg/evaluation/types.go | 4 ++-- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pkg/evaluation/scoring.go b/pkg/evaluation/scoring.go index 2a4008c44..5d533d83c 100644 --- a/pkg/evaluation/scoring.go +++ b/pkg/evaluation/scoring.go @@ -91,8 +91,10 @@ func computeSummary(results []Result) Summary { } } - summary.ToolsTotal += r.ToolCallsExpected - summary.ToolsPassed += r.ToolCallsScore * r.ToolCallsExpected + if r.ToolCallsExpected > 0 { + summary.ToolsF1Sum += r.ToolCallsScore + summary.ToolsCount++ + } summary.HandoffsTotal++ if r.HandoffsMatch { @@ -115,24 +117,28 @@ func printSummary(out io.Writer, summary Summary, duration time.Duration) { } printMetric(out, "Sizes", summary.SizesPassed, summary.SizesTotal) - printMetricFloat(out, "Tool Calls", summary.ToolsPassed, summary.ToolsTotal) + printF1Score(out, "Tool Calls", summary.ToolsF1Sum, summary.ToolsCount) printMetric(out, "Handoffs", summary.HandoffsPassed, summary.HandoffsTotal) - printMetricFloat(out, "Relevance", summary.RelevancePassed, summary.RelevanceTotal) + printMetric(out, "Relevance", int(summary.RelevancePassed), int(summary.RelevanceTotal)) fmt.Fprintf(out, "\nTotal Cost: $%.6f\n", summary.TotalCost) fmt.Fprintf(out, "Total Time: %s\n", duration.Round(time.Second)) } func printMetric(out io.Writer, label string, passed, total int) { - printMetricFloat(out, label, float64(passed), float64(total)) + if total == 0 { + return // Skip metrics with no data + } + ratio := float64(passed) / float64(total) + fmt.Fprintf(out, "%s %14s: %d/%d passed (%.1f%%)\n", statusIcon(ratio), label, passed, total, ratio*100) } -func printMetricFloat(out io.Writer, label string, passed, total float64) { - if total == 0 { +func printF1Score(out io.Writer, label string, f1Sum float64, count int) { + if count == 0 { return // Skip metrics with no data } - ratio := passed / total - fmt.Fprintf(out, "%s %14s: %.0f/%.0f passed (%.1f%%)\n", statusIcon(ratio), label, passed, total, ratio*100) + avgF1 := f1Sum / float64(count) + fmt.Fprintf(out, "%s %14s: %.1f%% avg F1 (%d evals)\n", statusIcon(avgF1), label, avgF1*100, count) } func statusIcon(ratio float64) string { diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go index cfcff04e9..167111c5e 100644 --- a/pkg/evaluation/types.go +++ b/pkg/evaluation/types.go @@ -90,8 +90,8 @@ type Summary struct { TotalCost float64 `json:"total_cost"` SizesPassed int `json:"sizes_passed"` SizesTotal int `json:"sizes_total"` - ToolsPassed float64 `json:"tools_passed"` - ToolsTotal float64 `json:"tools_total"` + ToolsF1Sum float64 `json:"tools_f1_sum"` + ToolsCount int `json:"tools_count"` HandoffsPassed int `json:"handoffs_passed"` HandoffsTotal int `json:"handoffs_total"` RelevancePassed float64 `json:"relevance_passed"`