diff --git a/cmd/root/eval.go b/cmd/root/eval.go
index 4b3183c20..47ad65fb5 100644
--- a/cmd/root/eval.go
+++ b/cmd/root/eval.go
@@ -118,14 +118,15 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error {
 		return evalErr
 	}
 
-	// Save results JSON
-	resultsPath, err := evaluation.SaveRunJSON(run, outputDir)
+	// Save sessions to SQLite database
+	dbPath, err := evaluation.SaveRunSessions(ctx, run, outputDir)
 	if err != nil {
-		slog.Error("Failed to save results", "error", err)
+		slog.Error("Failed to save sessions database", "error", err)
 	} else {
-		fmt.Fprintf(teeOut, "\nResults: %s\n", resultsPath)
-		fmt.Fprintf(teeOut, "Log: %s\n", logPath)
+		fmt.Fprintf(teeOut, "\nSessions: %s\n", dbPath)
 	}
 
+	fmt.Fprintf(teeOut, "Log: %s\n", logPath)
+
 	return evalErr
 }
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
index 69258b765..27a75c265 100644
--- a/pkg/evaluation/eval.go
+++ b/pkg/evaluation/eval.go
@@ -312,9 +312,11 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Resu
 	result.Response = response
 	result.Cost = cost
 	result.OutputTokens = outputTokens
-	result.RawOutput = events
 	result.Size = getResponseSize(result.Response)
 
+	// Build session from events for database storage
+	result.Session = SessionFromEvents(events, evalSess.Title, result.Question)
+
 	if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
 		result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
 	}
diff --git a/pkg/evaluation/save.go b/pkg/evaluation/save.go
index 63588a1d7..2f6c93613 100644
--- a/pkg/evaluation/save.go
+++ b/pkg/evaluation/save.go
@@ -2,14 +2,307 @@ package evaluation
 
 import (
 	"cmp"
+	"context"
 	"encoding/json"
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
+	"time"
 
+	"github.com/docker/cagent/pkg/chat"
 	"github.com/docker/cagent/pkg/session"
+	"github.com/docker/cagent/pkg/tools"
 )
 
+// SaveRunSessions saves all eval sessions to <outputDir>/<run.Name>.db, a SQLite
+// database created with the main session store so standard session tools can read
+// it. Results without a session are skipped; on any error the returned path is "".
+func SaveRunSessions(ctx context.Context, run *EvalRun, outputDir string) (dbPath string, err error) {
+	// Create output directory if needed
+	if err := os.MkdirAll(outputDir, 0o755); err != nil {
+		return "", fmt.Errorf("creating output directory: %w", err)
+	}
+
+	// Create a new SQLite session store for this eval run
+	path := filepath.Join(outputDir, run.Name+".db")
+	store, err := session.NewSQLiteSessionStore(path)
+	if err != nil {
+		return "", fmt.Errorf("creating session store: %w", err)
+	}
+	defer func() {
+		// Report a failed close unless an earlier error is already being returned.
+		if closer, ok := store.(interface{ Close() error }); ok {
+			if closeErr := closer.Close(); closeErr != nil && err == nil {
+				dbPath, err = "", fmt.Errorf("closing session store: %w", closeErr)
+			}
+		}
+	}()
+
+	// Save each result's session to the database
+	for i := range run.Results {
+		result := &run.Results[i]
+		if result.Session == nil {
+			continue
+		}
+		if err := store.AddSession(ctx, result.Session); err != nil {
+			return "", fmt.Errorf("saving session for %q: %w", result.Title, err)
+		}
+	}
+	return path, nil
+}
+
+// SessionFromEvents rebuilds a session titled title from the decoded JSON events
+// emitted by cagent --json: question (if non-empty) becomes the first user message,
+// followed by the assistant, tool and error messages reconstructed from the events.
+func SessionFromEvents(events []map[string]any, title, question string) *session.Session {
+	sess := session.New(
+		session.WithTitle(title),
+		session.WithToolsApproved(true),
+	)
+
+	// Add the user question as the first message
+	if question != "" {
+		sess.AddMessage(session.UserMessage(question))
+	}
+
+	// State of the assistant message being built (the agent name survives flushes)
+	var currentContent strings.Builder
+	var currentReasoningContent strings.Builder
+	var currentToolCalls []tools.ToolCall
+	var currentToolDefinitions []tools.Tool
+	var currentAgentName string
+	var currentModel string
+	var currentUsage *chat.Usage
+	var currentCost float64
+
+	// Helper to flush the pending assistant message; a no-op when nothing was accumulated
+	flushAssistantMessage := func() {
+		if currentContent.Len() > 0 || currentReasoningContent.Len() > 0 || len(currentToolCalls) > 0 {
+			msg := &session.Message{
+				AgentName: currentAgentName,
+				Message: chat.Message{
+					Role:             chat.MessageRoleAssistant,
+					Content:          currentContent.String(),
+					ReasoningContent: currentReasoningContent.String(),
+					ToolCalls:        currentToolCalls,
+					ToolDefinitions:  currentToolDefinitions,
+					CreatedAt:        time.Now().Format(time.RFC3339),
+					Model:            currentModel,
+					Usage:            currentUsage,
+					Cost:             currentCost,
+				},
+			}
+			sess.AddMessage(msg)
+			currentContent.Reset()
+			currentReasoningContent.Reset()
+			currentToolCalls = nil
+			currentToolDefinitions = nil
+			currentModel = ""
+			currentUsage = nil
+			currentCost = 0
+		}
+	}
+
+	for _, event := range events {
+		eventType, _ := event["type"].(string)
+
+		switch eventType {
+		case "agent_choice":
+			// Accumulate agent response content
+			if content, ok := event["content"].(string); ok {
+				currentContent.WriteString(content)
+			}
+			if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
+				currentAgentName = agentName
+			}
+
+		case "agent_choice_reasoning":
+			// Accumulate reasoning content (for models like DeepSeek, Claude with extended thinking)
+			if content, ok := event["content"].(string); ok {
+				currentReasoningContent.WriteString(content)
+			}
+			if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
+				currentAgentName = agentName
+			}
+
+		case "tool_call":
+			// Record a definition only together with its call: appending one for
+			// an event without a usable tool_call payload would break the
+			// ToolDefinitions[i] <-> ToolCalls[i] pairing for the whole message.
+			if tc, ok := event["tool_call"].(map[string]any); ok {
+				currentToolCalls = append(currentToolCalls, parseToolCall(tc))
+
+				// A call without a definition gets an empty placeholder.
+				var toolDef tools.Tool
+				if td, ok := event["tool_definition"].(map[string]any); ok {
+					toolDef = parseToolDefinition(td)
+				}
+				currentToolDefinitions = append(currentToolDefinitions, toolDef)
+			}
+			if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
+				currentAgentName = agentName
+			}
+
+		case "tool_call_response":
+			// Flush any pending assistant message before adding tool response
+			flushAssistantMessage()
+
+			// Add tool response message (events without a tool_call object are dropped)
+			if tc, ok := event["tool_call"].(map[string]any); ok {
+				toolCallID, _ := tc["id"].(string)
+				response, _ := event["response"].(string)
+
+				msg := &session.Message{
+					Message: chat.Message{
+						Role:       chat.MessageRoleTool,
+						Content:    response,
+						ToolCallID: toolCallID,
+						CreatedAt:  time.Now().Format(time.RFC3339),
+					},
+				}
+				sess.AddMessage(msg)
+			}
+
+		case "token_usage":
+			// Update session token usage (each event overwrites the previous totals)
+			if usage, ok := event["usage"].(map[string]any); ok {
+				if inputTokens, ok := usage["input_tokens"].(float64); ok {
+					sess.InputTokens = int64(inputTokens)
+				}
+				if outputTokens, ok := usage["output_tokens"].(float64); ok {
+					sess.OutputTokens = int64(outputTokens)
+				}
+				if cost, ok := usage["cost"].(float64); ok {
+					sess.Cost = cost
+				}
+				// Extract per-message usage if available; it is attached at the next flush
+				if lastMsg, ok := usage["last_message"].(map[string]any); ok {
+					currentUsage = parseMessageUsage(lastMsg)
+					if model, ok := lastMsg["Model"].(string); ok {
+						currentModel = model
+					}
+					if msgCost, ok := lastMsg["Cost"].(float64); ok {
+						currentCost = msgCost
+					}
+				}
+			}
+
+		case "error":
+			// Flush any pending assistant message before adding error
+			flushAssistantMessage()
+
+			// Add error as a system message so it's visible in the session
+			if errorMsg, ok := event["error"].(string); ok && errorMsg != "" {
+				msg := &session.Message{
+					Message: chat.Message{
+						Role:      chat.MessageRoleSystem,
+						Content:   "Error: " + errorMsg,
+						CreatedAt: time.Now().Format(time.RFC3339),
+					},
+				}
+				sess.AddMessage(msg)
+			}
+
+		case "session_title":
+			// Update session title if provided (may override the one from eval config)
+			if eventTitle, ok := event["title"].(string); ok && eventTitle != "" {
+				sess.Title = eventTitle
+			}
+
+		case "stream_stopped":
+			// Flush final assistant message
+			flushAssistantMessage()
+		}
+	}
+
+	// Flush any remaining content (covers event lists that lack a stream_stopped event)
+	flushAssistantMessage()
+
+	return sess
+}
+
+// parseToolCall builds a tools.ToolCall from the decoded JSON map of a tool call.
+// Keys that are missing or hold a value of the wrong type leave the field zero.
+func parseToolCall(tc map[string]any) tools.ToolCall {
+	// A failed type assertion yields "", the same value the field would keep
+	// if it were never assigned, so the ok results can be discarded.
+	id, _ := tc["id"].(string)
+	kind, _ := tc["type"].(string)
+
+	// Indexing a nil map is safe in Go, so a missing "function" object simply
+	// produces an empty name and empty arguments.
+	fn, _ := tc["function"].(map[string]any)
+	name, _ := fn["name"].(string)
+	args, _ := fn["arguments"].(string)
+
+	var call tools.ToolCall
+	call.ID = id
+	call.Type = tools.ToolType(kind)
+	call.Function.Name = name
+	call.Function.Arguments = args
+
+	return call
+}
+
+// parseToolDefinition builds a tools.Tool from the decoded JSON map of a tool
+// definition. Missing or non-string name, category and description stay empty.
+func parseToolDefinition(td map[string]any) tools.Tool {
+	// The ok results are dropped on purpose: a failed assertion yields "",
+	// which equals the zero value of the corresponding field.
+	name, _ := td["name"].(string)
+	category, _ := td["category"].(string)
+	description, _ := td["description"].(string)
+
+	return tools.Tool{
+		Name:        name,
+		Category:    category,
+		Description: description,
+
+		// Parameters is passed through untouched; an absent key reads as nil,
+		// which is what the field holds when it is never set.
+		Parameters: td["parameters"],
+	}
+}
+
+// parseMessageUsage converts a map representation of message usage to chat.Usage.
+//
+// Each counter is read from its snake_case key first (the JSON tags used by the
+// embedded chat.Usage fields) and from the capitalized Go field name as a
+// fallback. Only values decoded as JSON numbers (float64) are accepted; a counter
+// found under neither key stays 0. Cost and Model, which serialize with
+// capitalized names, are not read here.
+func parseMessageUsage(m map[string]any) *chat.Usage {
+	// firstNumber returns the value stored under the first key that holds a
+	// float64, truncated to int64, or 0 when none of the keys does.
+	firstNumber := func(keys ...string) int64 {
+		for _, key := range keys {
+			if n, ok := m[key].(float64); ok {
+				return int64(n)
+			}
+		}
+		return 0
+	}
+
+	usage := new(chat.Usage)
+
+	// Prompt and completion counters.
+	usage.InputTokens = firstNumber("input_tokens", "InputTokens")
+	usage.OutputTokens = firstNumber("output_tokens", "OutputTokens")
+
+	// Cache counters; note that the write counter's snake_case key is
+	// "cached_write_tokens" while its fallback key is "CacheWriteTokens".
+	usage.CachedInputTokens = firstNumber("cached_input_tokens", "CachedInputTokens")
+	usage.CacheWriteTokens = firstNumber("cached_write_tokens", "CacheWriteTokens")
+
+	// Reasoning counter.
+	usage.ReasoningTokens = firstNumber("reasoning_tokens", "ReasoningTokens")
+
+	return usage
+}
+
+// SaveRunJSON saves the eval run as JSON by calling saveJSON with the target path
+// <outputDir>/<run.Name>.json. Kept for backward compatibility and debugging.
 func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
 	return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
 }
diff --git a/pkg/evaluation/save_test.go b/pkg/evaluation/save_test.go
index 89c640165..bb9f3f4d1 100644
--- a/pkg/evaluation/save_test.go
+++ b/pkg/evaluation/save_test.go
@@ -3,10 +3,14 @@ package evaluation
 import (
 	"path/filepath"
 	"testing"
+	"time"
 
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
+	"github.com/docker/cagent/pkg/chat"
 	"github.com/docker/cagent/pkg/session"
+	"github.com/docker/cagent/pkg/tools"
 )
 
 func TestSaveWithCustomFilename(t *testing.T) {
@@ -35,3 +39,465 @@ func TestSaveWithCustomFilename(t *testing.T) {
 	require.Equal(t, filepath.Join("evals", "my-custom-eval_1.json"), evalFile3)
 	require.FileExists(t, evalFile3)
 }
+
+func TestSaveRunSessions(t *testing.T) {
+	t.Parallel()
+
+	ctx := t.Context()
+	outputDir := t.TempDir()
+
+	// Eval run with two results that carry a session and one failed result without
+	run := &EvalRun{
+		Name:      "test-eval-001",
+		Timestamp: time.Now(),
+		Results: []Result{
+			{
+				Title:    "eval-test-1",
+				Question: "What is the capital of France?",
+				Response: "Paris is the capital of France.",
+				Session: session.New(
+					session.WithTitle("eval-test-1"),
+					session.WithUserMessage("What is the capital of France?"),
+				),
+			},
+			{
+				Title:    "eval-test-2",
+				Question: "What is 2+2?",
+				Response: "4",
+				Session: session.New(
+					session.WithTitle("eval-test-2"),
+					session.WithUserMessage("What is 2+2?"),
+				),
+			},
+			{
+				// Result without a session (error case); it must not be stored
+				Title:   "eval-test-3",
+				Error:   "container failed",
+				Session: nil,
+			},
+		},
+	}
+
+	// Save sessions to the database; the file is expected at <outputDir>/<run name>.db
+	dbPath, err := SaveRunSessions(ctx, run, outputDir)
+	require.NoError(t, err)
+	assert.Equal(t, filepath.Join(outputDir, "test-eval-001.db"), dbPath)
+	assert.FileExists(t, dbPath)
+
+	// Reopen the database with a fresh store to verify what was persisted
+	store, err := session.NewSQLiteSessionStore(dbPath)
+	require.NoError(t, err)
+	defer func() {
+		if closer, ok := store.(interface{ Close() error }); ok {
+			_ = closer.Close()
+		}
+	}()
+
+	// Get all sessions
+	sessions, err := store.GetSessions(ctx)
+	require.NoError(t, err)
+	assert.Len(t, sessions, 2, "should have 2 sessions (excluding the error case)")
+
+	// Verify both titles were stored, without relying on the order of the returned sessions
+	titles := make(map[string]bool)
+	for _, sess := range sessions {
+		titles[sess.Title] = true
+	}
+	assert.True(t, titles["eval-test-1"], "should have eval-test-1")
+	assert.True(t, titles["eval-test-2"], "should have eval-test-2")
+}
+
+func TestSaveRunSessionsWithCost(t *testing.T) {
+	t.Parallel()
+
+	ctx := t.Context()
+	outputDir := t.TempDir()
+
+	// Create a session with token counts and cost set directly on the session
+	sess := session.New(
+		session.WithTitle("cost-test"),
+		session.WithUserMessage("test question"),
+	)
+	sess.InputTokens = 500
+	sess.OutputTokens = 200
+	sess.Cost = 0.0125
+
+	run := &EvalRun{
+		Name:      "test-cost-001",
+		Timestamp: time.Now(),
+		Results: []Result{
+			{
+				Title:    "cost-test",
+				Question: "test question",
+				Response: "test response",
+				Session:  sess,
+			},
+		},
+	}
+
+	// Save sessions to database
+	dbPath, err := SaveRunSessions(ctx, run, outputDir)
+	require.NoError(t, err)
+
+	// Reopen the database and verify token counts and cost survive the round trip
+	store, err := session.NewSQLiteSessionStore(dbPath)
+	require.NoError(t, err)
+	defer func() {
+		if closer, ok := store.(interface{ Close() error }); ok {
+			_ = closer.Close()
+		}
+	}()
+
+	sessions, err := store.GetSessions(ctx)
+	require.NoError(t, err)
+	require.Len(t, sessions, 1)
+
+	loadedSess := sessions[0]
+	assert.Equal(t, int64(500), loadedSess.InputTokens)
+	assert.Equal(t, int64(200), loadedSess.OutputTokens)
+	assert.InDelta(t, 0.0125, loadedSess.Cost, 0.0001, "cost should be preserved")
+}
+
+func TestSessionFromEvents(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name         string
+		events       []map[string]any
+		title        string
+		question     string
+		wantMessages int
+		wantContent  string
+	}{
+		{
+			name:         "empty events",
+			events:       []map[string]any{},
+			title:        "test",
+			question:     "hello",
+			wantMessages: 1, // just the user message
+			wantContent:  "",
+		},
+		{
+			name: "agent choice events",
+			events: []map[string]any{
+				{"type": "agent_choice", "content": "Hello ", "agent_name": "root"},
+				{"type": "agent_choice", "content": "world!"},
+				{"type": "stream_stopped"},
+			},
+			title:        "test",
+			question:     "greet me",
+			wantMessages: 2, // user + assistant (both content chunks merged into one message)
+			wantContent:  "Hello world!",
+		},
+		{
+			name: "tool calls and responses",
+			events: []map[string]any{
+				{"type": "agent_choice", "content": "Let me help.", "agent_name": "root"},
+				{
+					"type": "tool_call",
+					"tool_call": map[string]any{
+						"id":   "call_123",
+						"type": "function",
+						"function": map[string]any{
+							"name":      "read_file",
+							"arguments": `{"path": "test.txt"}`,
+						},
+					},
+				},
+				{
+					"type": "tool_call_response",
+					"tool_call": map[string]any{
+						"id": "call_123",
+					},
+					"response": "file content",
+				},
+				{"type": "agent_choice", "content": "Done!"},
+				{"type": "stream_stopped"},
+			},
+			title:        "test",
+			question:     "read file",
+			wantMessages: 4, // user + assistant (with tool call) + tool response + assistant
+			wantContent:  "Done!",
+		},
+		{
+			name: "token usage updates session",
+			events: []map[string]any{
+				{"type": "agent_choice", "content": "Answer"},
+				{
+					"type": "token_usage",
+					"usage": map[string]any{
+						"input_tokens":  float64(100),
+						"output_tokens": float64(50),
+						"cost":          0.005,
+					},
+				},
+				{"type": "stream_stopped"},
+			},
+			title:        "test",
+			question:     "question",
+			wantMessages: 2,
+			wantContent:  "Answer",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			sess := SessionFromEvents(tt.events, tt.title, tt.question)
+
+			assert.Equal(t, tt.title, sess.Title)
+			assert.Len(t, sess.Messages, tt.wantMessages)
+
+			// Check first message is the user message carrying the question
+			if tt.question != "" {
+				assert.Equal(t, chat.MessageRoleUser, sess.Messages[0].Message.Message.Role)
+				assert.Equal(t, tt.question, sess.Messages[0].Message.Message.Content)
+			}
+
+			// Check last assistant message content; an empty wantContent skips this check
+			if tt.wantContent != "" {
+				lastContent := sess.GetLastAssistantMessageContent()
+				assert.Equal(t, tt.wantContent, lastContent)
+			}
+		})
+	}
+}
+
+func TestSessionFromEventsTokenUsage(t *testing.T) {
+	t.Parallel()
+
+	events := []map[string]any{
+		{"type": "agent_choice", "content": "Answer"},
+		{
+			"type": "token_usage",
+			"usage": map[string]any{
+				"input_tokens":  float64(100),
+				"output_tokens": float64(50),
+				"cost":          0.005,
+			},
+		},
+		{"type": "stream_stopped"},
+	}
+
+	sess := SessionFromEvents(events, "test", "question")
+
+	assert.Equal(t, int64(100), sess.InputTokens) // session totals come from the token_usage event
+	assert.Equal(t, int64(50), sess.OutputTokens)
+	assert.InDelta(t, 0.005, sess.Cost, 0.0001)
+}
+
+func TestParseToolCall(t *testing.T) {
+	t.Parallel()
+
+	tc := map[string]any{
+		"id":   "call_abc",
+		"type": "function",
+		"function": map[string]any{
+			"name":      "read_file",
+			"arguments": `{"path": "foo.txt"}`,
+		},
+	}
+
+	toolCall := parseToolCall(tc)
+
+	assert.Equal(t, "call_abc", toolCall.ID) // every field of the map must be carried over
+	assert.Equal(t, tools.ToolType("function"), toolCall.Type)
+	assert.Equal(t, "read_file", toolCall.Function.Name)
+	assert.JSONEq(t, `{"path": "foo.txt"}`, toolCall.Function.Arguments)
+}
+
+func TestParseToolDefinition(t *testing.T) {
+	t.Parallel()
+
+	td := map[string]any{
+		"name":        "read_file",
+		"category":    "filesystem",
+		"description": "Read the contents of a file",
+		"parameters": map[string]any{
+			"type": "object",
+			"properties": map[string]any{
+				"path": map[string]any{
+					"type":        "string",
+					"description": "The file path to read",
+				},
+			},
+		},
+	}
+
+	toolDef := parseToolDefinition(td)
+
+	assert.Equal(t, "read_file", toolDef.Name)
+	assert.Equal(t, "filesystem", toolDef.Category)
+	assert.Equal(t, "Read the contents of a file", toolDef.Description)
+	assert.NotNil(t, toolDef.Parameters) // only presence is checked, not the schema contents
+}
+
+func TestSessionFromEventsWithToolDefinitions(t *testing.T) {
+	t.Parallel()
+
+	events := []map[string]any{
+		{"type": "agent_choice", "content": "Let me read that file.", "agent_name": "root"},
+		{
+			"type": "tool_call",
+			"tool_call": map[string]any{
+				"id":   "call_123",
+				"type": "function",
+				"function": map[string]any{
+					"name":      "read_file",
+					"arguments": `{"path": "test.txt"}`,
+				},
+			},
+			"tool_definition": map[string]any{
+				"name":        "read_file",
+				"category":    "filesystem",
+				"description": "Read the contents of a file",
+			},
+		},
+		{
+			"type": "tool_call_response",
+			"tool_call": map[string]any{
+				"id": "call_123",
+			},
+			"response": "file content",
+		},
+		{"type": "stream_stopped"},
+	}
+
+	sess := SessionFromEvents(events, "test", "read the file")
+
+	// Find the first assistant message that carries tool calls
+	var assistantMsg *session.Message
+	for _, item := range sess.Messages {
+		if item.Message != nil && item.Message.Message.Role == chat.MessageRoleAssistant && len(item.Message.Message.ToolCalls) > 0 {
+			assistantMsg = item.Message
+			break
+		}
+	}
+
+	require.NotNil(t, assistantMsg, "should have assistant message with tool calls")
+	assert.Len(t, assistantMsg.Message.ToolCalls, 1)
+	assert.Len(t, assistantMsg.Message.ToolDefinitions, 1)
+
+	// Verify tool call
+	toolCall := assistantMsg.Message.ToolCalls[0]
+	assert.Equal(t, "call_123", toolCall.ID)
+	assert.Equal(t, "read_file", toolCall.Function.Name)
+
+	// Verify the tool definition stored at the same index as the call
+	toolDef := assistantMsg.Message.ToolDefinitions[0]
+	assert.Equal(t, "read_file", toolDef.Name)
+	assert.Equal(t, "filesystem", toolDef.Category)
+	assert.Equal(t, "Read the contents of a file", toolDef.Description)
+}
+
+func TestSessionFromEventsWithReasoningContent(t *testing.T) {
+	t.Parallel()
+
+	events := []map[string]any{
+		{"type": "agent_choice_reasoning", "content": "Let me think about this...", "agent_name": "root"},
+		{"type": "agent_choice_reasoning", "content": " I should analyze the question."},
+		{"type": "agent_choice", "content": "Here is my answer."},
+		{"type": "stream_stopped"},
+	}
+
+	sess := SessionFromEvents(events, "test", "complex question")
+
+	// Find the first assistant message; reasoning and answer are expected in the same one
+	var assistantMsg *session.Message
+	for _, item := range sess.Messages {
+		if item.Message != nil && item.Message.Message.Role == chat.MessageRoleAssistant {
+			assistantMsg = item.Message
+			break
+		}
+	}
+
+	require.NotNil(t, assistantMsg, "should have assistant message")
+	assert.Equal(t, "Here is my answer.", assistantMsg.Message.Content)
+	assert.Equal(t, "Let me think about this... I should analyze the question.", assistantMsg.Message.ReasoningContent)
+}
+
+func TestSessionFromEventsWithPerMessageUsage(t *testing.T) {
+	t.Parallel()
+
+	events := []map[string]any{
+		{"type": "agent_choice", "content": "Hello!", "agent_name": "root"},
+		{
+			"type": "token_usage",
+			"usage": map[string]any{
+				"input_tokens":  float64(100),
+				"output_tokens": float64(50),
+				"cost":          0.005,
+				"last_message": map[string]any{
+					"input_tokens":        float64(100),
+					"output_tokens":       float64(50),
+					"cached_input_tokens": float64(25),
+					"Model":               "gpt-4o",
+					"Cost":                0.005,
+				},
+			},
+		},
+		{"type": "stream_stopped"},
+	}
+
+	sess := SessionFromEvents(events, "test", "hi")
+
+	// Check session-level usage (top-level fields of the usage object)
+	assert.Equal(t, int64(100), sess.InputTokens)
+	assert.Equal(t, int64(50), sess.OutputTokens)
+	assert.InDelta(t, 0.005, sess.Cost, 0.0001)
+
+	// Find the first assistant message; it should carry the last_message usage, model and cost
+	var assistantMsg *session.Message
+	for _, item := range sess.Messages {
+		if item.Message != nil && item.Message.Message.Role == chat.MessageRoleAssistant {
+			assistantMsg = item.Message
+			break
+		}
+	}
+
+	require.NotNil(t, assistantMsg, "should have assistant message")
+	assert.Equal(t, "gpt-4o", assistantMsg.Message.Model)
+	assert.InDelta(t, 0.005, assistantMsg.Message.Cost, 0.0001)
+	require.NotNil(t, assistantMsg.Message.Usage)
+	assert.Equal(t, int64(100), assistantMsg.Message.Usage.InputTokens)
+	assert.Equal(t, int64(50), assistantMsg.Message.Usage.OutputTokens)
+	assert.Equal(t, int64(25), assistantMsg.Message.Usage.CachedInputTokens)
+}
+
+func TestSessionFromEventsWithError(t *testing.T) {
+	t.Parallel()
+
+	events := []map[string]any{
+		{"type": "agent_choice", "content": "Let me try...", "agent_name": "root"},
+		{"type": "error", "error": "API rate limit exceeded"},
+		{"type": "stream_stopped"},
+	}
+
+	sess := SessionFromEvents(events, "test", "do something")
+
+	// Should have: user message, assistant message (flushed by the error), error message
+	assert.Len(t, sess.Messages, 3)
+
+	// Check the error was captured as a system message, after the assistant message
+	errorMsg := sess.Messages[2].Message
+	require.NotNil(t, errorMsg)
+	assert.Equal(t, chat.MessageRoleSystem, errorMsg.Message.Role)
+	assert.Contains(t, errorMsg.Message.Content, "API rate limit exceeded")
+}
+
+func TestSessionFromEventsWithSessionTitle(t *testing.T) {
+	t.Parallel()
+
+	events := []map[string]any{
+		{"type": "session_title", "title": "Auto-generated title"},
+		{"type": "agent_choice", "content": "Hello!"},
+		{"type": "stream_stopped"},
+	}
+
+	// Start with a default title passed by the caller
+	sess := SessionFromEvents(events, "default-title", "hi")
+
+	// Title should be replaced by the one from the session_title event
+	assert.Equal(t, "Auto-generated title", sess.Title)
+}
diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go
index e69fc9768..4050551fc 100644
--- a/pkg/evaluation/types.go
+++ b/pkg/evaluation/types.go
@@ -38,7 +38,7 @@ type Result struct {
 	RelevanceExpected float64          `json:"relevance_expected"`
 	FailedRelevance   []string         `json:"failed_relevance,omitempty"`
 	Error             string           `json:"error,omitempty"`
-	RawOutput         []map[string]any `json:"raw_output,omitempty"`
+	Session           *session.Session `json:"-"` // Full session for database storage (not in JSON)
 }
 
 // checkResults returns successes and failures for this result.
diff --git a/pkg/session/store.go b/pkg/session/store.go
index 61e56e66b..024829670 100644
--- a/pkg/session/store.go
+++ b/pkg/session/store.go
@@ -505,8 +505,8 @@ func (s *SQLiteSessionStore) AddSession(ctx context.Context, session *Session) e
 	defer func() { _ = tx.Rollback() }()
 
 	_, err = tx.ExecContext(ctx,
-		"INSERT INTO sessions (id, tools_approved, input_tokens, output_tokens, title, send_user_message, max_iterations, working_dir, created_at, permissions, agent_model_overrides, custom_models_used, thinking, parent_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
-		session.ID, session.ToolsApproved, session.InputTokens, session.OutputTokens, session.Title, session.SendUserMessage, session.MaxIterations, session.WorkingDir, session.CreatedAt.Format(time.RFC3339), permissionsJSON, agentModelOverridesJSON, customModelsUsedJSON, session.Thinking, parentID)
+		"INSERT INTO sessions (id, tools_approved, input_tokens, output_tokens, title, cost, send_user_message, max_iterations, working_dir, created_at, permissions, agent_model_overrides, custom_models_used, thinking, parent_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+		session.ID, session.ToolsApproved, session.InputTokens, session.OutputTokens, session.Title, session.Cost, session.SendUserMessage, session.MaxIterations, session.WorkingDir, session.CreatedAt.Format(time.RFC3339), permissionsJSON, agentModelOverridesJSON, customModelsUsedJSON, session.Thinking, parentID)
 	if err != nil {
 		return err
 	}