From 880c78ff4270597746c5c64f392b6060125f0f77 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 21 Mar 2026 06:09:04 +0000 Subject: [PATCH 1/2] Initial plan From cf189425fe5cadb2fa5b2e3aec21e519fcc58d7b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 21 Mar 2026 06:42:13 +0000 Subject: [PATCH 2/2] Fix metrics tracking bug: Turns always 0 and ToolCalls always null in run summaries - Fix Copilot CLI debug log format: count [DEBUG] data: blocks as turns (previously counted 'User:'/'Human:'/'Query:' patterns absent from Copilot logs) - Fix Copilot CLI debug log format: parseCopilotToolCallsWithSequence now updates toolCallMap from 'Executing tool: ' lines so ToolCalls is populated instead of null - Fix Copilot JSONL session format: fall back to counting assistant message entries as turns when num_turns is 0 or absent from the result entry - Fix buildToolCallsFromRPCMessages: emit tool call records for requests with null IDs instead of silently dropping them (fixes mcp_tool_usage.tool_calls null) - Fix audit.go: include MCPToolUsage in saved RunSummary for persistent caching - Add regression tests for all five bugs Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> Agent-Logs-Url: https://github.com/github/gh-aw/sessions/381cb673-bdaf-4cd0-9e90-6c530894d475 --- pkg/cli/audit.go | 1 + pkg/cli/copilot_metrics_fix_test.go | 136 +++++++++++++++++++++ pkg/cli/gateway_logs.go | 18 ++- pkg/cli/gateway_logs_test.go | 31 +++++ pkg/workflow/copilot_logs.go | 58 ++++++--- pkg/workflow/copilot_session_jsonl_test.go | 51 ++++++++ 6 files changed, 275 insertions(+), 20 deletions(-) create mode 100644 pkg/cli/copilot_metrics_fix_test.go diff --git a/pkg/cli/audit.go b/pkg/cli/audit.go index 6fca34096f9..12bcc2f104b 100644 --- a/pkg/cli/audit.go +++ b/pkg/cli/audit.go @@ -420,6 +420,7 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st MissingData: missingData, Noops: noops, MCPFailures: mcpFailures, + MCPToolUsage: mcpToolUsage, ArtifactsList: artifacts, JobDetails: jobDetails, } diff --git a/pkg/cli/copilot_metrics_fix_test.go b/pkg/cli/copilot_metrics_fix_test.go new file mode 100644 index 00000000000..871f5ff4c09 --- /dev/null +++ b/pkg/cli/copilot_metrics_fix_test.go @@ -0,0 +1,136 @@ +//go:build !integration + +package cli + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestCopilotDebugLogTurnsExtraction verifies that Turns are correctly counted from +// [DEBUG] data: blocks in the Copilot CLI debug log format. +// +// This is a regression test for the bug where Turns was always 0 because the parser +// counted "User:"/"Human:"/"Query:" patterns that do not appear in Copilot CLI debug logs. +// The fix counts each "[DEBUG] data:" block as one API response (one turn). +func TestCopilotDebugLogTurnsExtraction(t *testing.T) { + tempDir := t.TempDir() + + awInfoContent := `{"engine_id": "copilot"}` + require.NoError(t, os.WriteFile(filepath.Join(tempDir, "aw_info.json"), []byte(awInfoContent), 0644)) + + // Debug log with 2 API responses (2 data blocks) and "Executing tool: bash" between them. + // The JSON blocks have empty tool_calls (common in production Copilot CLI logs). + logContent := `2025-09-26T11:13:11.798Z [DEBUG] Starting Copilot CLI: 0.0.400 +2025-09-26T11:13:12.575Z [DEBUG] data: +2025-09-26T11:13:12.575Z [DEBUG] { +2025-09-26T11:13:12.575Z [DEBUG] "choices": [{ +2025-09-26T11:13:12.575Z [DEBUG] "message": { +2025-09-26T11:13:12.575Z [DEBUG] "role": "assistant", +2025-09-26T11:13:12.575Z [DEBUG] "content": null, +2025-09-26T11:13:12.575Z [DEBUG] "tool_calls": [] +2025-09-26T11:13:12.575Z [DEBUG] } +2025-09-26T11:13:12.575Z [DEBUG] }], +2025-09-26T11:13:12.575Z [DEBUG] "usage": { +2025-09-26T11:13:12.575Z [DEBUG] "prompt_tokens": 1000, +2025-09-26T11:13:12.575Z [DEBUG] "completion_tokens": 50, +2025-09-26T11:13:12.575Z [DEBUG] "total_tokens": 1050 +2025-09-26T11:13:12.575Z [DEBUG] } +2025-09-26T11:13:12.575Z [DEBUG] } +2025-09-26T11:13:13.000Z [DEBUG] Executing tool: bash +2025-09-26T11:13:13.500Z [DEBUG] Tool execution completed +2025-09-26T11:13:14.000Z [DEBUG] data: +2025-09-26T11:13:14.000Z [DEBUG] { +2025-09-26T11:13:14.000Z [DEBUG] "choices": [{ +2025-09-26T11:13:14.000Z [DEBUG] "message": { +2025-09-26T11:13:14.000Z [DEBUG] "role": "assistant", +2025-09-26T11:13:14.000Z [DEBUG] "content": "Task done.", +2025-09-26T11:13:14.000Z [DEBUG] "tool_calls": [] +2025-09-26T11:13:14.000Z [DEBUG] } +2025-09-26T11:13:14.000Z [DEBUG] }], +2025-09-26T11:13:14.000Z [DEBUG] "usage": { +2025-09-26T11:13:14.000Z [DEBUG] "prompt_tokens": 1200, +2025-09-26T11:13:14.000Z [DEBUG] "completion_tokens": 20, +2025-09-26T11:13:14.000Z [DEBUG] "total_tokens": 1220 +2025-09-26T11:13:14.000Z [DEBUG] } +2025-09-26T11:13:14.000Z [DEBUG] } +2025-09-26T11:13:14.500Z [DEBUG] Workflow completed` + + require.NoError(t, os.WriteFile(filepath.Join(tempDir, "agent.log"), []byte(logContent), 0644)) + + metrics, err := extractLogMetrics(tempDir, false) + require.NoError(t, err) + + // Tokens should be accumulated from both data blocks + assert.Equal(t, 2270, metrics.TokenUsage, "Should accumulate tokens from both data blocks: 1050 + 1220 = 2270") + + // Turns should be 2 (one per [DEBUG] data: block) + assert.Equal(t, 2, metrics.Turns, + "Turns should count [DEBUG] data: blocks; got %d", metrics.Turns) + + // Tool calls should be extracted from "Executing tool: bash" line + assert.NotEmpty(t, metrics.ToolCalls, "ToolCalls should not be null when 'Executing tool:' is present") + if len(metrics.ToolCalls) > 0 { + found := false + for _, tc := range metrics.ToolCalls { + if tc.Name == "bash" { + found = true + assert.Equal(t, 1, tc.CallCount, "bash should have been called once") + } + } + assert.True(t, found, "Expected to find 'bash' in tool calls") + } +} + +// TestCopilotDebugLogMultipleToolCalls verifies that multiple "Executing tool:" lines +// produce correct call counts in ToolCalls. +func TestCopilotDebugLogMultipleToolCalls(t *testing.T) { + tempDir := t.TempDir() + + awInfoContent := `{"engine_id": "copilot"}` + require.NoError(t, os.WriteFile(filepath.Join(tempDir, "aw_info.json"), []byte(awInfoContent), 0644)) + + // Three API response blocks with different tool executions + logContent := `2025-09-26T11:13:11.798Z [DEBUG] Starting Copilot CLI: 0.0.400 +2025-09-26T11:13:12.000Z [DEBUG] data: +2025-09-26T11:13:12.000Z [DEBUG] { +2025-09-26T11:13:12.000Z [DEBUG] "choices": [{"message": {"tool_calls": []}}], +2025-09-26T11:13:12.000Z [DEBUG] "usage": {"prompt_tokens": 500, "completion_tokens": 10, "total_tokens": 510} +2025-09-26T11:13:12.000Z [DEBUG] } +2025-09-26T11:13:12.500Z [DEBUG] Executing tool: bash +2025-09-26T11:13:13.000Z [DEBUG] data: +2025-09-26T11:13:13.000Z [DEBUG] { +2025-09-26T11:13:13.000Z [DEBUG] "choices": [{"message": {"tool_calls": []}}], +2025-09-26T11:13:13.000Z [DEBUG] "usage": {"prompt_tokens": 600, "completion_tokens": 10, "total_tokens": 610} +2025-09-26T11:13:13.000Z [DEBUG] } +2025-09-26T11:13:13.500Z [DEBUG] Executing tool: bash +2025-09-26T11:13:14.000Z [DEBUG] data: +2025-09-26T11:13:14.000Z [DEBUG] { +2025-09-26T11:13:14.000Z [DEBUG] "choices": [{"message": {"tool_calls": []}}], +2025-09-26T11:13:14.000Z [DEBUG] "usage": {"prompt_tokens": 700, "completion_tokens": 20, "total_tokens": 720} +2025-09-26T11:13:14.000Z [DEBUG] } +2025-09-26T11:13:14.500Z [DEBUG] Executing tool: mcp_github +2025-09-26T11:13:15.000Z [DEBUG] Workflow done` + + require.NoError(t, os.WriteFile(filepath.Join(tempDir, "agent.log"), []byte(logContent), 0644)) + + metrics, err := extractLogMetrics(tempDir, false) + require.NoError(t, err) + + // Turns: 3 data blocks + assert.Equal(t, 3, metrics.Turns, "Should count 3 turns from 3 data blocks") + + // Tool calls: bash x2, mcp_github x1 + assert.NotEmpty(t, metrics.ToolCalls, "ToolCalls should not be null") + + toolCounts := make(map[string]int) + for _, tc := range metrics.ToolCalls { + toolCounts[tc.Name] = tc.CallCount + } + assert.Equal(t, 2, toolCounts["bash"], "bash should have 2 calls") + assert.Equal(t, 1, toolCounts["mcp_github"], "mcp_github should have 1 call") +} diff --git a/pkg/cli/gateway_logs.go b/pkg/cli/gateway_logs.go index 790c2df8f98..3078c635bcc 100644 --- a/pkg/cli/gateway_logs.go +++ b/pkg/cli/gateway_logs.go @@ -769,6 +769,11 @@ func buildToolCallsFromRPCMessages(logPath string) ([]MCPToolCall, error) { return nil, fmt.Errorf("error reading rpc-messages.jsonl: %w", err) } + // Second pass: build MCPToolCall records. + // Declared before first pass so requests without IDs can be appended immediately. + var toolCalls []MCPToolCall + processedKeys := make(map[string]bool) + // First pass: index outgoing tool-call requests by (serverID, id) for i := range entries { e := &entries[i] @@ -783,6 +788,15 @@ func buildToolCallsFromRPCMessages(logPath string) ([]MCPToolCall, error) { continue } if e.req.ID == nil { + // Requests without an ID cannot be matched to responses. + // Emit the tool call immediately with "unknown" status so it appears + // in the tool_calls list (same as parseRPCMessages counts it in the summary). + toolCalls = append(toolCalls, MCPToolCall{ + Timestamp: e.entry.Timestamp, + ServerName: e.entry.ServerID, + ToolName: params.Name, + Status: "unknown", + }) continue } t, err := time.Parse(time.RFC3339Nano, e.entry.Timestamp) @@ -797,9 +811,7 @@ func buildToolCallsFromRPCMessages(logPath string) ([]MCPToolCall, error) { } } - // Second pass: build MCPToolCall records - var toolCalls []MCPToolCall - processedKeys := make(map[string]bool) + // Second pass: pair responses with pending requests to compute durations for i := range entries { e := &entries[i] diff --git a/pkg/cli/gateway_logs_test.go b/pkg/cli/gateway_logs_test.go index 5845548d0e8..c9fb235c707 100644 --- a/pkg/cli/gateway_logs_test.go +++ b/pkg/cli/gateway_logs_test.go @@ -764,3 +764,34 @@ func TestBuildToolCallsFromRPCMessages(t *testing.T) { assert.Equal(t, "error", getRepo.Status, "status should be error") assert.Equal(t, "rate limit", getRepo.Error, "error message should be set") } + +// TestBuildToolCallsFromRPCMessagesNullID verifies that requests with a null/missing ID +// are still included in the tool_calls output (regression test for mcp_tool_usage.tool_calls +// always being null when parseRPCMessages counted tool calls in the summary but +// buildToolCallsFromRPCMessages skipped null-ID requests). +func TestBuildToolCallsFromRPCMessagesNullID(t *testing.T) { + tmpDir := t.TempDir() + + // Requests with null ID (id:null) - these are counted in the summary by parseRPCMessages + // but were previously skipped by buildToolCallsFromRPCMessages, causing tool_calls=null. + rpcContent := `{"timestamp":"2024-01-12T10:00:00.000000000Z","direction":"OUT","type":"REQUEST","server_id":"github","payload":{"jsonrpc":"2.0","id":null,"method":"tools/call","params":{"name":"list_issues","arguments":{}}}} +{"timestamp":"2024-01-12T10:00:01.000000000Z","direction":"OUT","type":"REQUEST","server_id":"github","payload":{"jsonrpc":"2.0","id":null,"method":"tools/call","params":{"name":"issue_read","arguments":{}}}} +` + logPath := filepath.Join(tmpDir, "rpc-messages.jsonl") + require.NoError(t, os.WriteFile(logPath, []byte(rpcContent), 0644)) + + calls, err := buildToolCallsFromRPCMessages(logPath) + require.NoError(t, err, "should build tool calls without error") + + // Both requests should produce tool call records even without IDs + assert.Len(t, calls, 2, "null-ID requests should still produce tool call records") + + toolNames := make(map[string]bool) + for _, c := range calls { + toolNames[c.ToolName] = true + assert.Equal(t, "github", c.ServerName, "server name should be set") + assert.Equal(t, "unknown", c.Status, "status should be 'unknown' for null-ID requests") + } + assert.True(t, toolNames["list_issues"], "should include list_issues") + assert.True(t, toolNames["issue_read"], "should include issue_read") +} diff --git a/pkg/workflow/copilot_logs.go b/pkg/workflow/copilot_logs.go index 2db03d3088b..59d5ef15c36 100644 --- a/pkg/workflow/copilot_logs.go +++ b/pkg/workflow/copilot_logs.go @@ -49,6 +49,7 @@ func (e *CopilotEngine) parseSessionJSONL(logContent string, verbose bool) (LogM toolCallMap := make(map[string]*ToolCallInfo) var currentSequence []string turns := 0 + assistantMessageCount := 0 // fallback: count assistant messages when num_turns is absent lines := strings.Split(logContent, "\n") foundSessionEntry := false @@ -78,6 +79,9 @@ func (e *CopilotEngine) parseSessionJSONL(logContent string, verbose bool) (LogM } case "assistant": + // Each assistant message represents one LLM turn + assistantMessageCount++ + // Assistant message with potential tool calls if entry.Message != nil { for _, content := range entry.Message.Content { @@ -153,6 +157,14 @@ func (e *CopilotEngine) parseSessionJSONL(logContent string, verbose bool) (LogM } } + // If turns was not set from num_turns (0 or absent), fall back to counting assistant messages. + // The Copilot CLI may omit num_turns from the result entry; each assistant message represents + // one LLM conversation turn. + if turns == 0 && assistantMessageCount > 0 { + turns = assistantMessageCount + copilotLogsLog.Printf("num_turns not available in result entry, using assistant message count as turns: %d", turns) + } + // If we found no session entries, return false to indicate fallback needed if !foundSessionEntry { return metrics, false @@ -227,6 +239,15 @@ func (e *CopilotEngine) ParseLogMetrics(logContent string, verbose bool) LogMetr if strings.Contains(line, "[DEBUG] data:") { inDataBlock = true currentJSONLines = []string{} + // Each API response data block represents one LLM conversation turn. + // Copilot CLI debug logs don't have "User:"/"Human:" patterns, so we + // count turns based on the number of API responses (data blocks). + turns++ + // Save previous sequence before starting new turn + if len(currentSequence) > 0 { + metrics.ToolSequences = append(metrics.ToolSequences, currentSequence) + currentSequence = []string{} + } continue } @@ -282,17 +303,10 @@ func (e *CopilotEngine) ParseLogMetrics(logContent string, verbose bool) LogMetr } } - // Count turns based on interaction patterns (adjust based on actual Copilot CLI output) - if strings.Contains(line, "User:") || strings.Contains(line, "Human:") || strings.Contains(line, "Query:") { - turns++ - // Start of a new turn, save previous sequence if any - if len(currentSequence) > 0 { - metrics.ToolSequences = append(metrics.ToolSequences, currentSequence) - currentSequence = []string{} - } - } - - // Extract tool calls and add to sequence (adjust based on actual Copilot CLI output format) + // Extract tool calls and add to sequence and toolCallMap + // "Executing tool: " lines confirm tool execution and are used to populate + // both the tool sequence and tool call statistics. This handles the common case where + // Copilot CLI JSON blocks have empty tool_calls arrays but emit execution log lines. if toolName := e.parseCopilotToolCallsWithSequence(line, toolCallMap); toolName != "" { currentSequence = append(currentSequence, toolName) } @@ -403,19 +417,29 @@ func (e *CopilotEngine) processToolCalls(toolCalls []any, toolCallMap map[string } } -// parseCopilotToolCallsWithSequence extracts tool call information from Copilot CLI log lines and returns tool name +// parseCopilotToolCallsWithSequence extracts tool call information from Copilot CLI log lines and returns tool name. +// It also updates toolCallMap with the tool execution count for statistics tracking. func (e *CopilotEngine) parseCopilotToolCallsWithSequence(line string, toolCallMap map[string]*ToolCallInfo) string { - // This method handles simple tool execution log lines for sequence tracking - // Tool size extraction is now handled by extractToolCallSizes which parses JSON - // Look for "Executing tool:" pattern in Copilot logs if strings.Contains(line, "Executing tool:") { // Extract tool name from "Executing tool: " format parts := strings.Split(line, "Executing tool:") if len(parts) > 1 { toolName := strings.TrimSpace(parts[1]) - // Return the tool name for sequence tracking - // Size information is handled separately by extractToolCallSizes + if toolName == "" { + return "" + } + // Update toolCallMap: this captures tool calls from execution log lines. + // This is the primary source of tool call data in the Copilot CLI debug log + // format, since JSON response blocks often have empty tool_calls arrays. + if toolInfo, exists := toolCallMap[toolName]; exists { + toolInfo.CallCount++ + } else { + toolCallMap[toolName] = &ToolCallInfo{ + Name: toolName, + CallCount: 1, + } + } return toolName } } diff --git a/pkg/workflow/copilot_session_jsonl_test.go b/pkg/workflow/copilot_session_jsonl_test.go index bca8e8a6ea8..48c0d3ac507 100644 --- a/pkg/workflow/copilot_session_jsonl_test.go +++ b/pkg/workflow/copilot_session_jsonl_test.go @@ -216,3 +216,54 @@ func TestCopilotSessionJSONLToolSizes(t *testing.T) { t.Error("Expected MaxInputSize to be tracked") } } + +// TestCopilotSessionJSONLMissingNumTurns verifies that turns are counted from assistant +// messages when the result entry does not include num_turns (or has num_turns=0). +// This is a regression test for the bug where Turns was always 0 in run summaries +// because the Copilot CLI sometimes omits num_turns from the result entry. +func TestCopilotSessionJSONLMissingNumTurns(t *testing.T) { + // Session JSONL where the result entry omits num_turns entirely + logContent := `{"type":"system","subtype":"init","session_id":"copilot-no-turns","tools":["Bash"],"model":"gpt-4"} +{"type":"assistant","message":{"content":[{"type":"tool_use","id":"tool_1","name":"Bash","input":{"command":"ls"}}]}} +{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"tool_1","content":"file.txt"}]}} +{"type":"assistant","message":{"content":[{"type":"tool_use","id":"tool_2","name":"Bash","input":{"command":"cat file.txt"}}]}} +{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"tool_2","content":"contents"}]}} +{"type":"result","usage":{"input_tokens":500,"output_tokens":50}}` + + engine := NewCopilotEngine() + metrics := engine.ParseLogMetrics(logContent, false) + + // Tokens should still be extracted + if metrics.TokenUsage != 550 { + t.Errorf("Expected 550 tokens, got %d", metrics.TokenUsage) + } + + // Turns should fall back to assistant message count (2 assistant entries) + if metrics.Turns != 2 { + t.Errorf("Expected 2 turns (from assistant message count fallback), got %d", metrics.Turns) + } + + // Tool calls should be extracted from assistant entries + if len(metrics.ToolCalls) == 0 { + t.Error("Expected tool calls to be extracted") + } +} + +// TestCopilotSessionJSONLZeroNumTurns verifies that turns fall back to assistant count +// when num_turns is explicitly 0 in the result entry. +func TestCopilotSessionJSONLZeroNumTurns(t *testing.T) { + // Session JSONL where num_turns is explicitly 0 + logContent := `{"type":"system","subtype":"init","session_id":"copilot-zero-turns","tools":["Bash"],"model":"gpt-4"} +{"type":"assistant","message":{"content":[{"type":"tool_use","id":"tool_1","name":"Bash","input":{"command":"echo hi"}}]}} +{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"tool_1","content":"hi"}]}} +{"type":"assistant","message":{"content":[{"type":"text","text":"Task complete."}]}} +{"type":"result","usage":{"input_tokens":300,"output_tokens":30},"num_turns":0}` + + engine := NewCopilotEngine() + metrics := engine.ParseLogMetrics(logContent, false) + + // Turns should fall back to assistant message count (2 assistant entries) + if metrics.Turns != 2 { + t.Errorf("Expected 2 turns (fallback from zero num_turns), got %d", metrics.Turns) + } +}