diff --git a/cmd/root/eval.go b/cmd/root/eval.go index 4b3183c20..47ad65fb5 100644 --- a/cmd/root/eval.go +++ b/cmd/root/eval.go @@ -118,14 +118,15 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error { return evalErr } - // Save results JSON - resultsPath, err := evaluation.SaveRunJSON(run, outputDir) + // Save sessions to SQLite database + dbPath, err := evaluation.SaveRunSessions(ctx, run, outputDir) if err != nil { - slog.Error("Failed to save results", "error", err) + slog.Error("Failed to save sessions database", "error", err) } else { - fmt.Fprintf(teeOut, "\nResults: %s\n", resultsPath) - fmt.Fprintf(teeOut, "Log: %s\n", logPath) + fmt.Fprintf(teeOut, "\nSessions: %s\n", dbPath) } + fmt.Fprintf(teeOut, "Log: %s\n", logPath) + return evalErr } diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go index 69258b765..27a75c265 100644 --- a/pkg/evaluation/eval.go +++ b/pkg/evaluation/eval.go @@ -312,9 +312,11 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Resu result.Response = response result.Cost = cost result.OutputTokens = outputTokens - result.RawOutput = events result.Size = getResponseSize(result.Response) + // Build session from events for database storage + result.Session = SessionFromEvents(events, evalSess.Title, result.Question) + if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 { result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls) } diff --git a/pkg/evaluation/save.go b/pkg/evaluation/save.go index 63588a1d7..2f6c93613 100644 --- a/pkg/evaluation/save.go +++ b/pkg/evaluation/save.go @@ -2,14 +2,307 @@ package evaluation import ( "cmp" + "context" "encoding/json" "fmt" "os" "path/filepath" + "strings" + "time" + "github.com/docker/cagent/pkg/chat" "github.com/docker/cagent/pkg/session" + "github.com/docker/cagent/pkg/tools" ) +// SaveRunSessions saves all eval sessions to a SQLite database file. +// The database follows the same schema as the main session store, +// allowing the sessions to be loaded and inspected using standard session tools. +func SaveRunSessions(ctx context.Context, run *EvalRun, outputDir string) (string, error) { + dbPath := filepath.Join(outputDir, run.Name+".db") + + // Create output directory if needed + if err := os.MkdirAll(outputDir, 0o755); err != nil { + return "", fmt.Errorf("creating output directory: %w", err) + } + + // Create a new SQLite session store for this eval run + store, err := session.NewSQLiteSessionStore(dbPath) + if err != nil { + return "", fmt.Errorf("creating session store: %w", err) + } + defer func() { + if closer, ok := store.(interface{ Close() error }); ok { + _ = closer.Close() + } + }() + + // Save each result's session to the database + for i := range run.Results { + result := &run.Results[i] + if result.Session == nil { + continue + } + + if err := store.AddSession(ctx, result.Session); err != nil { + return "", fmt.Errorf("saving session for %q: %w", result.Title, err) + } + } + + return dbPath, nil +} + +// SessionFromEvents reconstructs a session from raw container output events. +// This parses the JSON events emitted by cagent --json and builds a session +// with the conversation history. +func SessionFromEvents(events []map[string]any, title, question string) *session.Session { + sess := session.New( + session.WithTitle(title), + session.WithToolsApproved(true), + ) + + // Add the user question as the first message + if question != "" { + sess.AddMessage(session.UserMessage(question)) + } + + // Track current assistant message being built + var currentContent strings.Builder + var currentReasoningContent strings.Builder + var currentToolCalls []tools.ToolCall + var currentToolDefinitions []tools.Tool + var currentAgentName string + var currentModel string + var currentUsage *chat.Usage + var currentCost float64 + + // Helper to flush current assistant message + flushAssistantMessage := func() { + if currentContent.Len() > 0 || currentReasoningContent.Len() > 0 || len(currentToolCalls) > 0 { + msg := &session.Message{ + AgentName: currentAgentName, + Message: chat.Message{ + Role: chat.MessageRoleAssistant, + Content: currentContent.String(), + ReasoningContent: currentReasoningContent.String(), + ToolCalls: currentToolCalls, + ToolDefinitions: currentToolDefinitions, + CreatedAt: time.Now().Format(time.RFC3339), + Model: currentModel, + Usage: currentUsage, + Cost: currentCost, + }, + } + sess.AddMessage(msg) + currentContent.Reset() + currentReasoningContent.Reset() + currentToolCalls = nil + currentToolDefinitions = nil + currentModel = "" + currentUsage = nil + currentCost = 0 + } + } + + for _, event := range events { + eventType, _ := event["type"].(string) + + switch eventType { + case "agent_choice": + // Accumulate agent response content + if content, ok := event["content"].(string); ok { + currentContent.WriteString(content) + } + if agentName, ok := event["agent_name"].(string); ok && agentName != "" { + currentAgentName = agentName + } + + case "agent_choice_reasoning": + // Accumulate reasoning content (for models like DeepSeek, Claude with extended thinking) + if content, ok := event["content"].(string); ok { + currentReasoningContent.WriteString(content) + } + if agentName, ok := event["agent_name"].(string); ok && agentName != "" { + currentAgentName = agentName + } + + case "tool_call": + // Parse tool call and add to current message + if tc, ok := event["tool_call"].(map[string]any); ok { + toolCall := parseToolCall(tc) + currentToolCalls = append(currentToolCalls, toolCall) + } + // Parse tool definition if present + if td, ok := event["tool_definition"].(map[string]any); ok { + toolDef := parseToolDefinition(td) + currentToolDefinitions = append(currentToolDefinitions, toolDef) + } else { + // Add empty tool definition to maintain index alignment with tool calls + currentToolDefinitions = append(currentToolDefinitions, tools.Tool{}) + } + if agentName, ok := event["agent_name"].(string); ok && agentName != "" { + currentAgentName = agentName + } + + case "tool_call_response": + // Flush any pending assistant message before adding tool response + flushAssistantMessage() + + // Add tool response message + if tc, ok := event["tool_call"].(map[string]any); ok { + toolCallID, _ := tc["id"].(string) + response, _ := event["response"].(string) + + msg := &session.Message{ + Message: chat.Message{ + Role: chat.MessageRoleTool, + Content: response, + ToolCallID: toolCallID, + CreatedAt: time.Now().Format(time.RFC3339), + }, + } + sess.AddMessage(msg) + } + + case "token_usage": + // Update session token usage + if usage, ok := event["usage"].(map[string]any); ok { + if inputTokens, ok := usage["input_tokens"].(float64); ok { + sess.InputTokens = int64(inputTokens) + } + if outputTokens, ok := usage["output_tokens"].(float64); ok { + sess.OutputTokens = int64(outputTokens) + } + if cost, ok := usage["cost"].(float64); ok { + sess.Cost = cost + } + // Extract per-message usage if available + if lastMsg, ok := usage["last_message"].(map[string]any); ok { + currentUsage = parseMessageUsage(lastMsg) + if model, ok := lastMsg["Model"].(string); ok { + currentModel = model + } + if msgCost, ok := lastMsg["Cost"].(float64); ok { + currentCost = msgCost + } + } + } + + case "error": + // Flush any pending assistant message before adding error + flushAssistantMessage() + + // Add error as a system message so it's visible in the session + if errorMsg, ok := event["error"].(string); ok && errorMsg != "" { + msg := &session.Message{ + Message: chat.Message{ + Role: chat.MessageRoleSystem, + Content: "Error: " + errorMsg, + CreatedAt: time.Now().Format(time.RFC3339), + }, + } + sess.AddMessage(msg) + } + + case "session_title": + // Update session title if provided (may override the one from eval config) + if eventTitle, ok := event["title"].(string); ok && eventTitle != "" { + sess.Title = eventTitle + } + + case "stream_stopped": + // Flush final assistant message + flushAssistantMessage() + } + } + + // Flush any remaining content + flushAssistantMessage() + + return sess +} + +// parseToolCall converts a map representation of a tool call to tools.ToolCall +func parseToolCall(tc map[string]any) tools.ToolCall { + toolCall := tools.ToolCall{} + + if id, ok := tc["id"].(string); ok { + toolCall.ID = id + } + if typ, ok := tc["type"].(string); ok { + toolCall.Type = tools.ToolType(typ) + } + + if fn, ok := tc["function"].(map[string]any); ok { + if name, ok := fn["name"].(string); ok { + toolCall.Function.Name = name + } + if args, ok := fn["arguments"].(string); ok { + toolCall.Function.Arguments = args + } + } + + return toolCall +} + +// parseToolDefinition converts a map representation of a tool definition to tools.Tool +func parseToolDefinition(td map[string]any) tools.Tool { + toolDef := tools.Tool{} + + if name, ok := td["name"].(string); ok { + toolDef.Name = name + } + if category, ok := td["category"].(string); ok { + toolDef.Category = category + } + if description, ok := td["description"].(string); ok { + toolDef.Description = description + } + if parameters, ok := td["parameters"]; ok { + toolDef.Parameters = parameters + } + + return toolDef +} + +// parseMessageUsage converts a map representation of message usage to chat.Usage +// Note: The embedded chat.Usage fields use snake_case JSON tags (input_tokens, etc.) +// while Cost and Model don't have JSON tags and serialize with capitalized names. +func parseMessageUsage(m map[string]any) *chat.Usage { + usage := &chat.Usage{} + + // Try snake_case first (from JSON serialization), then capitalized (fallback) + if v, ok := m["input_tokens"].(float64); ok { + usage.InputTokens = int64(v) + } else if v, ok := m["InputTokens"].(float64); ok { + usage.InputTokens = int64(v) + } + if v, ok := m["output_tokens"].(float64); ok { + usage.OutputTokens = int64(v) + } else if v, ok := m["OutputTokens"].(float64); ok { + usage.OutputTokens = int64(v) + } + if v, ok := m["cached_input_tokens"].(float64); ok { + usage.CachedInputTokens = int64(v) + } else if v, ok := m["CachedInputTokens"].(float64); ok { + usage.CachedInputTokens = int64(v) + } + if v, ok := m["cached_write_tokens"].(float64); ok { + usage.CacheWriteTokens = int64(v) + } else if v, ok := m["CacheWriteTokens"].(float64); ok { + usage.CacheWriteTokens = int64(v) + } + if v, ok := m["reasoning_tokens"].(float64); ok { + usage.ReasoningTokens = int64(v) + } else if v, ok := m["ReasoningTokens"].(float64); ok { + usage.ReasoningTokens = int64(v) + } + + return usage +} + +// SaveRunJSON saves the eval run results to a JSON file. +// This is kept for backward compatibility and debugging purposes. func SaveRunJSON(run *EvalRun, outputDir string) (string, error) { return saveJSON(run, filepath.Join(outputDir, run.Name+".json")) } diff --git a/pkg/evaluation/save_test.go b/pkg/evaluation/save_test.go index 89c640165..bb9f3f4d1 100644 --- a/pkg/evaluation/save_test.go +++ b/pkg/evaluation/save_test.go @@ -3,10 +3,14 @@ package evaluation import ( "path/filepath" "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/docker/cagent/pkg/chat" "github.com/docker/cagent/pkg/session" + "github.com/docker/cagent/pkg/tools" ) func TestSaveWithCustomFilename(t *testing.T) { @@ -35,3 +39,465 @@ func TestSaveWithCustomFilename(t *testing.T) { require.Equal(t, filepath.Join("evals", "my-custom-eval_1.json"), evalFile3) require.FileExists(t, evalFile3) } + +func TestSaveRunSessions(t *testing.T) { + t.Parallel() + + ctx := t.Context() + outputDir := t.TempDir() + + // Create an eval run with sessions + run := &EvalRun{ + Name: "test-eval-001", + Timestamp: time.Now(), + Results: []Result{ + { + Title: "eval-test-1", + Question: "What is the capital of France?", + Response: "Paris is the capital of France.", + Session: session.New( + session.WithTitle("eval-test-1"), + session.WithUserMessage("What is the capital of France?"), + ), + }, + { + Title: "eval-test-2", + Question: "What is 2+2?", + Response: "4", + Session: session.New( + session.WithTitle("eval-test-2"), + session.WithUserMessage("What is 2+2?"), + ), + }, + { + // Result without a session (error case) + Title: "eval-test-3", + Error: "container failed", + Session: nil, + }, + }, + } + + // Save sessions to database + dbPath, err := SaveRunSessions(ctx, run, outputDir) + require.NoError(t, err) + assert.Equal(t, filepath.Join(outputDir, "test-eval-001.db"), dbPath) + assert.FileExists(t, dbPath) + + // Verify we can read sessions back from the database + store, err := session.NewSQLiteSessionStore(dbPath) + require.NoError(t, err) + defer func() { + if closer, ok := store.(interface{ Close() error }); ok { + _ = closer.Close() + } + }() + + // Get all sessions + sessions, err := store.GetSessions(ctx) + require.NoError(t, err) + assert.Len(t, sessions, 2, "should have 2 sessions (excluding the error case)") + + // Verify session content + titles := make(map[string]bool) + for _, sess := range sessions { + titles[sess.Title] = true + } + assert.True(t, titles["eval-test-1"], "should have eval-test-1") + assert.True(t, titles["eval-test-2"], "should have eval-test-2") +} + +func TestSaveRunSessionsWithCost(t *testing.T) { + t.Parallel() + + ctx := t.Context() + outputDir := t.TempDir() + + // Create a session with cost data + sess := session.New( + session.WithTitle("cost-test"), + session.WithUserMessage("test question"), + ) + sess.InputTokens = 500 + sess.OutputTokens = 200 + sess.Cost = 0.0125 + + run := &EvalRun{ + Name: "test-cost-001", + Timestamp: time.Now(), + Results: []Result{ + { + Title: "cost-test", + Question: "test question", + Response: "test response", + Session: sess, + }, + }, + } + + // Save sessions to database + dbPath, err := SaveRunSessions(ctx, run, outputDir) + require.NoError(t, err) + + // Verify we can read sessions back with cost preserved + store, err := session.NewSQLiteSessionStore(dbPath) + require.NoError(t, err) + defer func() { + if closer, ok := store.(interface{ Close() error }); ok { + _ = closer.Close() + } + }() + + sessions, err := store.GetSessions(ctx) + require.NoError(t, err) + require.Len(t, sessions, 1) + + loadedSess := sessions[0] + assert.Equal(t, int64(500), loadedSess.InputTokens) + assert.Equal(t, int64(200), loadedSess.OutputTokens) + assert.InDelta(t, 0.0125, loadedSess.Cost, 0.0001, "cost should be preserved") +} + +func TestSessionFromEvents(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + events []map[string]any + title string + question string + wantMessages int + wantContent string + }{ + { + name: "empty events", + events: []map[string]any{}, + title: "test", + question: "hello", + wantMessages: 1, // just the user message + wantContent: "", + }, + { + name: "agent choice events", + events: []map[string]any{ + {"type": "agent_choice", "content": "Hello ", "agent_name": "root"}, + {"type": "agent_choice", "content": "world!"}, + {"type": "stream_stopped"}, + }, + title: "test", + question: "greet me", + wantMessages: 2, // user + assistant + wantContent: "Hello world!", + }, + { + name: "tool calls and responses", + events: []map[string]any{ + {"type": "agent_choice", "content": "Let me help.", "agent_name": "root"}, + { + "type": "tool_call", + "tool_call": map[string]any{ + "id": "call_123", + "type": "function", + "function": map[string]any{ + "name": "read_file", + "arguments": `{"path": "test.txt"}`, + }, + }, + }, + { + "type": "tool_call_response", + "tool_call": map[string]any{ + "id": "call_123", + }, + "response": "file content", + }, + {"type": "agent_choice", "content": "Done!"}, + {"type": "stream_stopped"}, + }, + title: "test", + question: "read file", + wantMessages: 4, // user + assistant (with tool call) + tool response + assistant + wantContent: "Done!", + }, + { + name: "token usage updates session", + events: []map[string]any{ + {"type": "agent_choice", "content": "Answer"}, + { + "type": "token_usage", + "usage": map[string]any{ + "input_tokens": float64(100), + "output_tokens": float64(50), + "cost": 0.005, + }, + }, + {"type": "stream_stopped"}, + }, + title: "test", + question: "question", + wantMessages: 2, + wantContent: "Answer", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + sess := SessionFromEvents(tt.events, tt.title, tt.question) + + assert.Equal(t, tt.title, sess.Title) + assert.Len(t, sess.Messages, tt.wantMessages) + + // Check first message is user message + if tt.question != "" { + assert.Equal(t, chat.MessageRoleUser, sess.Messages[0].Message.Message.Role) + assert.Equal(t, tt.question, sess.Messages[0].Message.Message.Content) + } + + // Check last assistant message content if expected + if tt.wantContent != "" { + lastContent := sess.GetLastAssistantMessageContent() + assert.Equal(t, tt.wantContent, lastContent) + } + }) + } +} + +func TestSessionFromEventsTokenUsage(t *testing.T) { + t.Parallel() + + events := []map[string]any{ + {"type": "agent_choice", "content": "Answer"}, + { + "type": "token_usage", + "usage": map[string]any{ + "input_tokens": float64(100), + "output_tokens": float64(50), + "cost": 0.005, + }, + }, + {"type": "stream_stopped"}, + } + + sess := SessionFromEvents(events, "test", "question") + + assert.Equal(t, int64(100), sess.InputTokens) + assert.Equal(t, int64(50), sess.OutputTokens) + assert.InDelta(t, 0.005, sess.Cost, 0.0001) +} + +func TestParseToolCall(t *testing.T) { + t.Parallel() + + tc := map[string]any{ + "id": "call_abc", + "type": "function", + "function": map[string]any{ + "name": "read_file", + "arguments": `{"path": "foo.txt"}`, + }, + } + + toolCall := parseToolCall(tc) + + assert.Equal(t, "call_abc", toolCall.ID) + assert.Equal(t, tools.ToolType("function"), toolCall.Type) + assert.Equal(t, "read_file", toolCall.Function.Name) + assert.JSONEq(t, `{"path": "foo.txt"}`, toolCall.Function.Arguments) +} + +func TestParseToolDefinition(t *testing.T) { + t.Parallel() + + td := map[string]any{ + "name": "read_file", + "category": "filesystem", + "description": "Read the contents of a file", + "parameters": map[string]any{ + "type": "object", + "properties": map[string]any{ + "path": map[string]any{ + "type": "string", + "description": "The file path to read", + }, + }, + }, + } + + toolDef := parseToolDefinition(td) + + assert.Equal(t, "read_file", toolDef.Name) + assert.Equal(t, "filesystem", toolDef.Category) + assert.Equal(t, "Read the contents of a file", toolDef.Description) + assert.NotNil(t, toolDef.Parameters) +} + +func TestSessionFromEventsWithToolDefinitions(t *testing.T) { + t.Parallel() + + events := []map[string]any{ + {"type": "agent_choice", "content": "Let me read that file.", "agent_name": "root"}, + { + "type": "tool_call", + "tool_call": map[string]any{ + "id": "call_123", + "type": "function", + "function": map[string]any{ + "name": "read_file", + "arguments": `{"path": "test.txt"}`, + }, + }, + "tool_definition": map[string]any{ + "name": "read_file", + "category": "filesystem", + "description": "Read the contents of a file", + }, + }, + { + "type": "tool_call_response", + "tool_call": map[string]any{ + "id": "call_123", + }, + "response": "file content", + }, + {"type": "stream_stopped"}, + } + + sess := SessionFromEvents(events, "test", "read the file") + + // Find the assistant message with tool calls + var assistantMsg *session.Message + for _, item := range sess.Messages { + if item.Message != nil && item.Message.Message.Role == chat.MessageRoleAssistant && len(item.Message.Message.ToolCalls) > 0 { + assistantMsg = item.Message + break + } + } + + require.NotNil(t, assistantMsg, "should have assistant message with tool calls") + assert.Len(t, assistantMsg.Message.ToolCalls, 1) + assert.Len(t, assistantMsg.Message.ToolDefinitions, 1) + + // Verify tool call + toolCall := assistantMsg.Message.ToolCalls[0] + assert.Equal(t, "call_123", toolCall.ID) + assert.Equal(t, "read_file", toolCall.Function.Name) + + // Verify tool definition + toolDef := assistantMsg.Message.ToolDefinitions[0] + assert.Equal(t, "read_file", toolDef.Name) + assert.Equal(t, "filesystem", toolDef.Category) + assert.Equal(t, "Read the contents of a file", toolDef.Description) +} + +func TestSessionFromEventsWithReasoningContent(t *testing.T) { + t.Parallel() + + events := []map[string]any{ + {"type": "agent_choice_reasoning", "content": "Let me think about this...", "agent_name": "root"}, + {"type": "agent_choice_reasoning", "content": " I should analyze the question."}, + {"type": "agent_choice", "content": "Here is my answer."}, + {"type": "stream_stopped"}, + } + + sess := SessionFromEvents(events, "test", "complex question") + + // Find the assistant message + var assistantMsg *session.Message + for _, item := range sess.Messages { + if item.Message != nil && item.Message.Message.Role == chat.MessageRoleAssistant { + assistantMsg = item.Message + break + } + } + + require.NotNil(t, assistantMsg, "should have assistant message") + assert.Equal(t, "Here is my answer.", assistantMsg.Message.Content) + assert.Equal(t, "Let me think about this... I should analyze the question.", assistantMsg.Message.ReasoningContent) +} + +func TestSessionFromEventsWithPerMessageUsage(t *testing.T) { + t.Parallel() + + events := []map[string]any{ + {"type": "agent_choice", "content": "Hello!", "agent_name": "root"}, + { + "type": "token_usage", + "usage": map[string]any{ + "input_tokens": float64(100), + "output_tokens": float64(50), + "cost": 0.005, + "last_message": map[string]any{ + "input_tokens": float64(100), + "output_tokens": float64(50), + "cached_input_tokens": float64(25), + "Model": "gpt-4o", + "Cost": 0.005, + }, + }, + }, + {"type": "stream_stopped"}, + } + + sess := SessionFromEvents(events, "test", "hi") + + // Check session-level usage + assert.Equal(t, int64(100), sess.InputTokens) + assert.Equal(t, int64(50), sess.OutputTokens) + assert.InDelta(t, 0.005, sess.Cost, 0.0001) + + // Find the assistant message + var assistantMsg *session.Message + for _, item := range sess.Messages { + if item.Message != nil && item.Message.Message.Role == chat.MessageRoleAssistant { + assistantMsg = item.Message + break + } + } + + require.NotNil(t, assistantMsg, "should have assistant message") + assert.Equal(t, "gpt-4o", assistantMsg.Message.Model) + assert.InDelta(t, 0.005, assistantMsg.Message.Cost, 0.0001) + require.NotNil(t, assistantMsg.Message.Usage) + assert.Equal(t, int64(100), assistantMsg.Message.Usage.InputTokens) + assert.Equal(t, int64(50), assistantMsg.Message.Usage.OutputTokens) + assert.Equal(t, int64(25), assistantMsg.Message.Usage.CachedInputTokens) +} + +func TestSessionFromEventsWithError(t *testing.T) { + t.Parallel() + + events := []map[string]any{ + {"type": "agent_choice", "content": "Let me try...", "agent_name": "root"}, + {"type": "error", "error": "API rate limit exceeded"}, + {"type": "stream_stopped"}, + } + + sess := SessionFromEvents(events, "test", "do something") + + // Should have: user message, assistant message, error message + assert.Len(t, sess.Messages, 3) + + // Check the error message was captured + errorMsg := sess.Messages[2].Message + require.NotNil(t, errorMsg) + assert.Equal(t, chat.MessageRoleSystem, errorMsg.Message.Role) + assert.Contains(t, errorMsg.Message.Content, "API rate limit exceeded") +} + +func TestSessionFromEventsWithSessionTitle(t *testing.T) { + t.Parallel() + + events := []map[string]any{ + {"type": "session_title", "title": "Auto-generated title"}, + {"type": "agent_choice", "content": "Hello!"}, + {"type": "stream_stopped"}, + } + + // Start with a default title + sess := SessionFromEvents(events, "default-title", "hi") + + // Title should be updated from the event + assert.Equal(t, "Auto-generated title", sess.Title) +} diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go index e69fc9768..4050551fc 100644 --- a/pkg/evaluation/types.go +++ b/pkg/evaluation/types.go @@ -38,7 +38,7 @@ type Result struct { RelevanceExpected float64 `json:"relevance_expected"` FailedRelevance []string `json:"failed_relevance,omitempty"` Error string `json:"error,omitempty"` - RawOutput []map[string]any `json:"raw_output,omitempty"` + Session *session.Session `json:"-"` // Full session for database storage (not in JSON) } // checkResults returns successes and failures for this result. diff --git a/pkg/session/store.go b/pkg/session/store.go index 61e56e66b..024829670 100644 --- a/pkg/session/store.go +++ b/pkg/session/store.go @@ -505,8 +505,8 @@ func (s *SQLiteSessionStore) AddSession(ctx context.Context, session *Session) e defer func() { _ = tx.Rollback() }() _, err = tx.ExecContext(ctx, - "INSERT INTO sessions (id, tools_approved, input_tokens, output_tokens, title, send_user_message, max_iterations, working_dir, created_at, permissions, agent_model_overrides, custom_models_used, thinking, parent_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - session.ID, session.ToolsApproved, session.InputTokens, session.OutputTokens, session.Title, session.SendUserMessage, session.MaxIterations, session.WorkingDir, session.CreatedAt.Format(time.RFC3339), permissionsJSON, agentModelOverridesJSON, customModelsUsedJSON, session.Thinking, parentID) + "INSERT INTO sessions (id, tools_approved, input_tokens, output_tokens, title, cost, send_user_message, max_iterations, working_dir, created_at, permissions, agent_model_overrides, custom_models_used, thinking, parent_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + session.ID, session.ToolsApproved, session.InputTokens, session.OutputTokens, session.Title, session.Cost, session.SendUserMessage, session.MaxIterations, session.WorkingDir, session.CreatedAt.Format(time.RFC3339), permissionsJSON, agentModelOverridesJSON, customModelsUsedJSON, session.Thinking, parentID) if err != nil { return err }