Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions cmd/root/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,15 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error {
return evalErr
}

// Save results JSON
resultsPath, err := evaluation.SaveRunJSON(run, outputDir)
// Save sessions to SQLite database
dbPath, err := evaluation.SaveRunSessions(ctx, run, outputDir)
if err != nil {
slog.Error("Failed to save results", "error", err)
slog.Error("Failed to save sessions database", "error", err)
} else {
fmt.Fprintf(teeOut, "\nResults: %s\n", resultsPath)
fmt.Fprintf(teeOut, "Log: %s\n", logPath)
fmt.Fprintf(teeOut, "\nSessions: %s\n", dbPath)
}

fmt.Fprintf(teeOut, "Log: %s\n", logPath)

return evalErr
}
4 changes: 3 additions & 1 deletion pkg/evaluation/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,9 +312,11 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Resu
result.Response = response
result.Cost = cost
result.OutputTokens = outputTokens
result.RawOutput = events
result.Size = getResponseSize(result.Response)

// Build session from events for database storage
result.Session = SessionFromEvents(events, evalSess.Title, result.Question)

if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
}
Expand Down
293 changes: 293 additions & 0 deletions pkg/evaluation/save.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,307 @@ package evaluation

import (
"cmp"
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"time"

"github.com/docker/cagent/pkg/chat"
"github.com/docker/cagent/pkg/session"
"github.com/docker/cagent/pkg/tools"
)

// SaveRunSessions saves all eval sessions to a SQLite database file.
// The database follows the same schema as the main session store,
// allowing the sessions to be loaded and inspected using standard session tools.
func SaveRunSessions(ctx context.Context, run *EvalRun, outputDir string) (string, error) {
dbPath := filepath.Join(outputDir, run.Name+".db")

// Create output directory if needed
if err := os.MkdirAll(outputDir, 0o755); err != nil {
return "", fmt.Errorf("creating output directory: %w", err)
}

// Create a new SQLite session store for this eval run
store, err := session.NewSQLiteSessionStore(dbPath)
if err != nil {
return "", fmt.Errorf("creating session store: %w", err)
}
defer func() {
if closer, ok := store.(interface{ Close() error }); ok {
_ = closer.Close()
}
}()

// Save each result's session to the database
for i := range run.Results {
result := &run.Results[i]
if result.Session == nil {
continue
}

if err := store.AddSession(ctx, result.Session); err != nil {
return "", fmt.Errorf("saving session for %q: %w", result.Title, err)
}
}

return dbPath, nil
}

// SessionFromEvents reconstructs a session from raw container output events.
// This parses the JSON events emitted by cagent --json and builds a session
// with the conversation history.
func SessionFromEvents(events []map[string]any, title, question string) *session.Session {
sess := session.New(
session.WithTitle(title),
session.WithToolsApproved(true),
)

// Add the user question as the first message
if question != "" {
sess.AddMessage(session.UserMessage(question))
}

// Track current assistant message being built
var currentContent strings.Builder
var currentReasoningContent strings.Builder
var currentToolCalls []tools.ToolCall
var currentToolDefinitions []tools.Tool
var currentAgentName string
var currentModel string
var currentUsage *chat.Usage
var currentCost float64

// Helper to flush current assistant message
flushAssistantMessage := func() {
if currentContent.Len() > 0 || currentReasoningContent.Len() > 0 || len(currentToolCalls) > 0 {
msg := &session.Message{
AgentName: currentAgentName,
Message: chat.Message{
Role: chat.MessageRoleAssistant,
Content: currentContent.String(),
ReasoningContent: currentReasoningContent.String(),
ToolCalls: currentToolCalls,
ToolDefinitions: currentToolDefinitions,
CreatedAt: time.Now().Format(time.RFC3339),
Model: currentModel,
Usage: currentUsage,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ISSUE: Message timestamps use time.Now() instead of preserving event timing

The SessionFromEvents function reconstructs sessions from historical evaluation events, but uses time.Now() for message timestamps. This means:

  1. All messages get the timestamp of when the database was created, not when events actually occurred
  2. Messages that happened seconds/minutes apart during execution will appear simultaneous
  3. Temporal ordering and latency analysis is lost

This affects three locations:

  • Line 92 (this location): Assistant messages in flushAssistantMessage()
  • Line 161: Tool call response messages
  • Line 201: Error messages

Since runtime events don't contain timestamp information (verified in pkg/runtime/event.go), consider either:

  • Adding timestamps to runtime events, or
  • Using incrementing timestamps based on event sequence to preserve ordering, or
  • Using eval run start time + offset based on position

While this doesn't break functionality, it does reduce the debugging value of the eval session database for investigating timing and latency issues.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue is that our runtime events don't have a timing for now. I'll add this to my TODO

Cost: currentCost,
},
}
sess.AddMessage(msg)
currentContent.Reset()
currentReasoningContent.Reset()
currentToolCalls = nil
currentToolDefinitions = nil
currentModel = ""
currentUsage = nil
currentCost = 0
}
}

for _, event := range events {
eventType, _ := event["type"].(string)

switch eventType {
case "agent_choice":
// Accumulate agent response content
if content, ok := event["content"].(string); ok {
currentContent.WriteString(content)
}
if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
currentAgentName = agentName
}

case "agent_choice_reasoning":
// Accumulate reasoning content (for models like DeepSeek, Claude with extended thinking)
if content, ok := event["content"].(string); ok {
currentReasoningContent.WriteString(content)
}
if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
currentAgentName = agentName
}

case "tool_call":
// Parse tool call and add to current message
if tc, ok := event["tool_call"].(map[string]any); ok {
toolCall := parseToolCall(tc)
currentToolCalls = append(currentToolCalls, toolCall)
}
// Parse tool definition if present
if td, ok := event["tool_definition"].(map[string]any); ok {
toolDef := parseToolDefinition(td)
currentToolDefinitions = append(currentToolDefinitions, toolDef)
} else {
// Add empty tool definition to maintain index alignment with tool calls
currentToolDefinitions = append(currentToolDefinitions, tools.Tool{})
}
if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
currentAgentName = agentName
}

case "tool_call_response":
// Flush any pending assistant message before adding tool response
flushAssistantMessage()

// Add tool response message
if tc, ok := event["tool_call"].(map[string]any); ok {
toolCallID, _ := tc["id"].(string)
response, _ := event["response"].(string)

msg := &session.Message{
Message: chat.Message{
Role: chat.MessageRoleTool,
Content: response,
ToolCallID: toolCallID,
CreatedAt: time.Now().Format(time.RFC3339),
},
}
sess.AddMessage(msg)
}

case "token_usage":
// Update session token usage
if usage, ok := event["usage"].(map[string]any); ok {
if inputTokens, ok := usage["input_tokens"].(float64); ok {
sess.InputTokens = int64(inputTokens)
}
if outputTokens, ok := usage["output_tokens"].(float64); ok {
sess.OutputTokens = int64(outputTokens)
}
if cost, ok := usage["cost"].(float64); ok {
sess.Cost = cost
}
// Extract per-message usage if available
if lastMsg, ok := usage["last_message"].(map[string]any); ok {
currentUsage = parseMessageUsage(lastMsg)
if model, ok := lastMsg["Model"].(string); ok {
currentModel = model
}
if msgCost, ok := lastMsg["Cost"].(float64); ok {
currentCost = msgCost
}
}
}

case "error":
// Flush any pending assistant message before adding error
flushAssistantMessage()

// Add error as a system message so it's visible in the session
if errorMsg, ok := event["error"].(string); ok && errorMsg != "" {
msg := &session.Message{
Message: chat.Message{
Role: chat.MessageRoleSystem,
Content: "Error: " + errorMsg,
CreatedAt: time.Now().Format(time.RFC3339),
},
}
sess.AddMessage(msg)
}

case "session_title":
// Update session title if provided (may override the one from eval config)
if eventTitle, ok := event["title"].(string); ok && eventTitle != "" {
sess.Title = eventTitle
}

case "stream_stopped":
// Flush final assistant message
flushAssistantMessage()
}
}

// Flush any remaining content
flushAssistantMessage()

return sess
}

// parseToolCall converts a map representation of a tool call to tools.ToolCall
func parseToolCall(tc map[string]any) tools.ToolCall {
toolCall := tools.ToolCall{}

if id, ok := tc["id"].(string); ok {
toolCall.ID = id
}
if typ, ok := tc["type"].(string); ok {
toolCall.Type = tools.ToolType(typ)
}

if fn, ok := tc["function"].(map[string]any); ok {
if name, ok := fn["name"].(string); ok {
toolCall.Function.Name = name
}
if args, ok := fn["arguments"].(string); ok {
toolCall.Function.Arguments = args
}
}

return toolCall
}

// parseToolDefinition converts a map representation of a tool definition to tools.Tool
func parseToolDefinition(td map[string]any) tools.Tool {
toolDef := tools.Tool{}

if name, ok := td["name"].(string); ok {
toolDef.Name = name
}
if category, ok := td["category"].(string); ok {
toolDef.Category = category
}
if description, ok := td["description"].(string); ok {
toolDef.Description = description
}
if parameters, ok := td["parameters"]; ok {
toolDef.Parameters = parameters
}

return toolDef
}

// parseMessageUsage converts a map representation of message usage to chat.Usage
// Note: The embedded chat.Usage fields use snake_case JSON tags (input_tokens, etc.)
// while Cost and Model don't have JSON tags and serialize with capitalized names.
func parseMessageUsage(m map[string]any) *chat.Usage {
usage := &chat.Usage{}

// Try snake_case first (from JSON serialization), then capitalized (fallback)
if v, ok := m["input_tokens"].(float64); ok {
usage.InputTokens = int64(v)
} else if v, ok := m["InputTokens"].(float64); ok {
usage.InputTokens = int64(v)
}
if v, ok := m["output_tokens"].(float64); ok {
usage.OutputTokens = int64(v)
} else if v, ok := m["OutputTokens"].(float64); ok {
usage.OutputTokens = int64(v)
}
if v, ok := m["cached_input_tokens"].(float64); ok {
usage.CachedInputTokens = int64(v)
} else if v, ok := m["CachedInputTokens"].(float64); ok {
usage.CachedInputTokens = int64(v)
}
if v, ok := m["cached_write_tokens"].(float64); ok {
usage.CacheWriteTokens = int64(v)
} else if v, ok := m["CacheWriteTokens"].(float64); ok {
usage.CacheWriteTokens = int64(v)
}
if v, ok := m["reasoning_tokens"].(float64); ok {
usage.ReasoningTokens = int64(v)
} else if v, ok := m["ReasoningTokens"].(float64); ok {
usage.ReasoningTokens = int64(v)
}

return usage
}

// SaveRunJSON saves the eval run results to a JSON file.
// This is kept for backward compatibility and debugging purposes.
func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
}
Expand Down
Loading
Loading