docker · dgageot · Feb 4, 2026 · Feb 4, 2026 · github-actions · Feb 4, 2026
@@ -118,14 +118,15 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error {
 		return evalErr
 	}
 
-	// Save results JSON
-	resultsPath, err := evaluation.SaveRunJSON(run, outputDir)
+	// Save sessions to SQLite database
+	dbPath, err := evaluation.SaveRunSessions(ctx, run, outputDir)
 	if err != nil {
-		slog.Error("Failed to save results", "error", err)
+		slog.Error("Failed to save sessions database", "error", err)
 	} else {
-		fmt.Fprintf(teeOut, "\nResults: %s\n", resultsPath)
-		fmt.Fprintf(teeOut, "Log: %s\n", logPath)
+		fmt.Fprintf(teeOut, "\nSessions: %s\n", dbPath)
 	}
 
+	fmt.Fprintf(teeOut, "Log: %s\n", logPath)
+
 	return evalErr
 }
@@ -312,9 +312,11 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Resu
 	result.Response = response
 	result.Cost = cost
 	result.OutputTokens = outputTokens
-	result.RawOutput = events
 	result.Size = getResponseSize(result.Response)
 
+	// Build session from events for database storage
+	result.Session = SessionFromEvents(events, evalSess.Title, result.Question)
+
 	if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
 		result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
 	}

@@ -2,14 +2,307 @@ package evaluation
 
 import (
 	"cmp"
+	"context"
 	"encoding/json"
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
+	"time"
 
+	"github.com/docker/cagent/pkg/chat"
 	"github.com/docker/cagent/pkg/session"
+	"github.com/docker/cagent/pkg/tools"
 )
 
+// SaveRunSessions saves all eval sessions to a SQLite database file.
+// The database follows the same schema as the main session store,
+// allowing the sessions to be loaded and inspected using standard session tools.
+func SaveRunSessions(ctx context.Context, run *EvalRun, outputDir string) (string, error) {
+	dbPath := filepath.Join(outputDir, run.Name+".db")
+
+	// Create output directory if needed
+	if err := os.MkdirAll(outputDir, 0o755); err != nil {
+		return "", fmt.Errorf("creating output directory: %w", err)
+	}
+
+	// Create a new SQLite session store for this eval run
+	store, err := session.NewSQLiteSessionStore(dbPath)
+	if err != nil {
+		return "", fmt.Errorf("creating session store: %w", err)
+	}
+	defer func() {
+		if closer, ok := store.(interface{ Close() error }); ok {
+			_ = closer.Close()
+		}
+	}()
+
+	// Save each result's session to the database
+	for i := range run.Results {
+		result := &run.Results[i]
+		if result.Session == nil {
+			continue
+		}
+
+		if err := store.AddSession(ctx, result.Session); err != nil {
+			return "", fmt.Errorf("saving session for %q: %w", result.Title, err)
+		}
+	}
+
+	return dbPath, nil
+}
+
+// SessionFromEvents reconstructs a session from raw container output events.
+// This parses the JSON events emitted by cagent --json and builds a session
+// with the conversation history.
+func SessionFromEvents(events []map[string]any, title, question string) *session.Session {
+	sess := session.New(
+		session.WithTitle(title),
+		session.WithToolsApproved(true),
+	)
+
+	// Add the user question as the first message
+	if question != "" {
+		sess.AddMessage(session.UserMessage(question))
+	}
+
+	// Track current assistant message being built
+	var currentContent strings.Builder
+	var currentReasoningContent strings.Builder
+	var currentToolCalls []tools.ToolCall
+	var currentToolDefinitions []tools.Tool
+	var currentAgentName string
+	var currentModel string
+	var currentUsage *chat.Usage
+	var currentCost float64
+
+	// Helper to flush current assistant message
+	flushAssistantMessage := func() {
+		if currentContent.Len() > 0 || currentReasoningContent.Len() > 0 || len(currentToolCalls) > 0 {
+			msg := &session.Message{
+				AgentName: currentAgentName,
+				Message: chat.Message{
+					Role:             chat.MessageRoleAssistant,
+					Content:          currentContent.String(),
+					ReasoningContent: currentReasoningContent.String(),
+					ToolCalls:        currentToolCalls,
+					ToolDefinitions:  currentToolDefinitions,
+					CreatedAt:        time.Now().Format(time.RFC3339),
+					Model:            currentModel,
+					Usage:            currentUsage,
+					Cost:             currentCost,
+				},
+			}
+			sess.AddMessage(msg)
+			currentContent.Reset()
+			currentReasoningContent.Reset()
+			currentToolCalls = nil
+			currentToolDefinitions = nil
+			currentModel = ""
+			currentUsage = nil
+			currentCost = 0
+		}
+	}
+
+	for _, event := range events {
+		eventType, _ := event["type"].(string)
+
+		switch eventType {
+		case "agent_choice":
+			// Accumulate agent response content
+			if content, ok := event["content"].(string); ok {
+				currentContent.WriteString(content)
+			}
+			if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
+				currentAgentName = agentName
+			}
+
+		case "agent_choice_reasoning":
+			// Accumulate reasoning content (for models like DeepSeek, Claude with extended thinking)
+			if content, ok := event["content"].(string); ok {
+				currentReasoningContent.WriteString(content)
+			}
+			if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
+				currentAgentName = agentName
+			}
+
+		case "tool_call":
+			// Parse tool call and add to current message
+			if tc, ok := event["tool_call"].(map[string]any); ok {
+				toolCall := parseToolCall(tc)
+				currentToolCalls = append(currentToolCalls, toolCall)
+			}
+			// Parse tool definition if present
+			if td, ok := event["tool_definition"].(map[string]any); ok {
+				toolDef := parseToolDefinition(td)
+				currentToolDefinitions = append(currentToolDefinitions, toolDef)
+			} else {
+				// Add empty tool definition to maintain index alignment with tool calls
+				currentToolDefinitions = append(currentToolDefinitions, tools.Tool{})
+			}
+			if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
+				currentAgentName = agentName
+			}
+
+		case "tool_call_response":
+			// Flush any pending assistant message before adding tool response
+			flushAssistantMessage()
+
+			// Add tool response message
+			if tc, ok := event["tool_call"].(map[string]any); ok {
+				toolCallID, _ := tc["id"].(string)
+				response, _ := event["response"].(string)
+
+				msg := &session.Message{
+					Message: chat.Message{
+						Role:       chat.MessageRoleTool,
+						Content:    response,
+						ToolCallID: toolCallID,
+						CreatedAt:  time.Now().Format(time.RFC3339),
+					},
+				}
+				sess.AddMessage(msg)
+			}
+
+		case "token_usage":
+			// Update session token usage
+			if usage, ok := event["usage"].(map[string]any); ok {
+				if inputTokens, ok := usage["input_tokens"].(float64); ok {
+					sess.InputTokens = int64(inputTokens)
+				}
+				if outputTokens, ok := usage["output_tokens"].(float64); ok {
+					sess.OutputTokens = int64(outputTokens)
+				}
+				if cost, ok := usage["cost"].(float64); ok {
+					sess.Cost = cost
+				}
+				// Extract per-message usage if available
+				if lastMsg, ok := usage["last_message"].(map[string]any); ok {
+					currentUsage = parseMessageUsage(lastMsg)
+					if model, ok := lastMsg["Model"].(string); ok {
+						currentModel = model
+					}
+					if msgCost, ok := lastMsg["Cost"].(float64); ok {
+						currentCost = msgCost
+					}
+				}
+			}
+
+		case "error":
+			// Flush any pending assistant message before adding error
+			flushAssistantMessage()
+
+			// Add error as a system message so it's visible in the session
+			if errorMsg, ok := event["error"].(string); ok && errorMsg != "" {
+				msg := &session.Message{
+					Message: chat.Message{
+						Role:      chat.MessageRoleSystem,
+						Content:   "Error: " + errorMsg,
+						CreatedAt: time.Now().Format(time.RFC3339),
+					},
+				}
+				sess.AddMessage(msg)
+			}
+
+		case "session_title":
+			// Update session title if provided (may override the one from eval config)
+			if eventTitle, ok := event["title"].(string); ok && eventTitle != "" {
+				sess.Title = eventTitle
+			}
+
+		case "stream_stopped":
+			// Flush final assistant message
+			flushAssistantMessage()
+		}
+	}
+
+	// Flush any remaining content
+	flushAssistantMessage()
+
+	return sess
+}
+
+// parseToolCall converts a map representation of a tool call to tools.ToolCall
+func parseToolCall(tc map[string]any) tools.ToolCall {
+	toolCall := tools.ToolCall{}
+
+	if id, ok := tc["id"].(string); ok {
+		toolCall.ID = id
+	}
+	if typ, ok := tc["type"].(string); ok {
+		toolCall.Type = tools.ToolType(typ)
+	}
+
+	if fn, ok := tc["function"].(map[string]any); ok {
+		if name, ok := fn["name"].(string); ok {
+			toolCall.Function.Name = name
+		}
+		if args, ok := fn["arguments"].(string); ok {
+			toolCall.Function.Arguments = args
+		}
+	}
+
+	return toolCall
+}
+
+// parseToolDefinition converts a map representation of a tool definition to tools.Tool
+func parseToolDefinition(td map[string]any) tools.Tool {
+	toolDef := tools.Tool{}
+
+	if name, ok := td["name"].(string); ok {
+		toolDef.Name = name
+	}
+	if category, ok := td["category"].(string); ok {
+		toolDef.Category = category
+	}
+	if description, ok := td["description"].(string); ok {
+		toolDef.Description = description
+	}
+	if parameters, ok := td["parameters"]; ok {
+		toolDef.Parameters = parameters
+	}
+
+	return toolDef
+}
+
+// parseMessageUsage converts a map representation of message usage to chat.Usage
+// Note: The embedded chat.Usage fields use snake_case JSON tags (input_tokens, etc.)
+// while Cost and Model don't have JSON tags and serialize with capitalized names.
+func parseMessageUsage(m map[string]any) *chat.Usage {
+	usage := &chat.Usage{}
+
+	// Try snake_case first (from JSON serialization), then capitalized (fallback)
+	if v, ok := m["input_tokens"].(float64); ok {
+		usage.InputTokens = int64(v)
+	} else if v, ok := m["InputTokens"].(float64); ok {
+		usage.InputTokens = int64(v)
+	}
+	if v, ok := m["output_tokens"].(float64); ok {
+		usage.OutputTokens = int64(v)
+	} else if v, ok := m["OutputTokens"].(float64); ok {
+		usage.OutputTokens = int64(v)
+	}
+	if v, ok := m["cached_input_tokens"].(float64); ok {
+		usage.CachedInputTokens = int64(v)
+	} else if v, ok := m["CachedInputTokens"].(float64); ok {
+		usage.CachedInputTokens = int64(v)
+	}
+	if v, ok := m["cached_write_tokens"].(float64); ok {
+		usage.CacheWriteTokens = int64(v)
+	} else if v, ok := m["CacheWriteTokens"].(float64); ok {
+		usage.CacheWriteTokens = int64(v)
+	}
+	if v, ok := m["reasoning_tokens"].(float64); ok {
+		usage.ReasoningTokens = int64(v)
+	} else if v, ok := m["ReasoningTokens"].(float64); ok {
+		usage.ReasoningTokens = int64(v)
+	}
+
+	return usage
+}
+
+// SaveRunJSON saves the eval run results to a JSON file.
+// This is kept for backward compatibility and debugging purposes.
 func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
 	return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
 }