Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/cli/audit.go
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st
MissingData: missingData,
Noops: noops,
MCPFailures: mcpFailures,
MCPToolUsage: mcpToolUsage,
ArtifactsList: artifacts,
JobDetails: jobDetails,
}
Expand Down
136 changes: 136 additions & 0 deletions pkg/cli/copilot_metrics_fix_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
//go:build !integration

package cli

import (
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestCopilotDebugLogTurnsExtraction feeds extractLogMetrics a Copilot CLI debug log
// containing two "[DEBUG] data:" blocks separated by an "Executing tool: bash" line and
// checks the resulting token total, turn count, and tool call statistics.
//
// Regression coverage: Turns used to stay at 0 because the parser only looked for
// "User:"/"Human:"/"Query:" markers, which are absent from Copilot CLI debug logs.
// Every "[DEBUG] data:" block is expected to be counted as one turn.
func TestCopilotDebugLogTurnsExtraction(t *testing.T) {
	dir := t.TempDir()

	// aw_info.json declares the copilot engine_id for this run directory.
	infoPath := filepath.Join(dir, "aw_info.json")
	require.NoError(t, os.WriteFile(infoPath, []byte(`{"engine_id": "copilot"}`), 0644))

	// Two data blocks, both with empty tool_calls arrays, and one tool execution
	// line between them.
	debugLog := `2025-09-26T11:13:11.798Z [DEBUG] Starting Copilot CLI: 0.0.400
2025-09-26T11:13:12.575Z [DEBUG] data:
2025-09-26T11:13:12.575Z [DEBUG] {
2025-09-26T11:13:12.575Z [DEBUG] "choices": [{
2025-09-26T11:13:12.575Z [DEBUG] "message": {
2025-09-26T11:13:12.575Z [DEBUG] "role": "assistant",
2025-09-26T11:13:12.575Z [DEBUG] "content": null,
2025-09-26T11:13:12.575Z [DEBUG] "tool_calls": []
2025-09-26T11:13:12.575Z [DEBUG] }
2025-09-26T11:13:12.575Z [DEBUG] }],
2025-09-26T11:13:12.575Z [DEBUG] "usage": {
2025-09-26T11:13:12.575Z [DEBUG] "prompt_tokens": 1000,
2025-09-26T11:13:12.575Z [DEBUG] "completion_tokens": 50,
2025-09-26T11:13:12.575Z [DEBUG] "total_tokens": 1050
2025-09-26T11:13:12.575Z [DEBUG] }
2025-09-26T11:13:12.575Z [DEBUG] }
2025-09-26T11:13:13.000Z [DEBUG] Executing tool: bash
2025-09-26T11:13:13.500Z [DEBUG] Tool execution completed
2025-09-26T11:13:14.000Z [DEBUG] data:
2025-09-26T11:13:14.000Z [DEBUG] {
2025-09-26T11:13:14.000Z [DEBUG] "choices": [{
2025-09-26T11:13:14.000Z [DEBUG] "message": {
2025-09-26T11:13:14.000Z [DEBUG] "role": "assistant",
2025-09-26T11:13:14.000Z [DEBUG] "content": "Task done.",
2025-09-26T11:13:14.000Z [DEBUG] "tool_calls": []
2025-09-26T11:13:14.000Z [DEBUG] }
2025-09-26T11:13:14.000Z [DEBUG] }],
2025-09-26T11:13:14.000Z [DEBUG] "usage": {
2025-09-26T11:13:14.000Z [DEBUG] "prompt_tokens": 1200,
2025-09-26T11:13:14.000Z [DEBUG] "completion_tokens": 20,
2025-09-26T11:13:14.000Z [DEBUG] "total_tokens": 1220
2025-09-26T11:13:14.000Z [DEBUG] }
2025-09-26T11:13:14.000Z [DEBUG] }
2025-09-26T11:13:14.500Z [DEBUG] Workflow completed`

	agentLogPath := filepath.Join(dir, "agent.log")
	require.NoError(t, os.WriteFile(agentLogPath, []byte(debugLog), 0644))

	metrics, err := extractLogMetrics(dir, false)
	require.NoError(t, err)

	// Token usage is the sum of total_tokens across both blocks.
	assert.Equal(t, 2270, metrics.TokenUsage, "Should accumulate tokens from both data blocks: 1050 + 1220 = 2270")

	// One turn per "[DEBUG] data:" block.
	assert.Equal(t, 2, metrics.Turns,
		"Turns should count [DEBUG] data: blocks; got %d", metrics.Turns)

	// The "Executing tool: bash" line must surface as a tool call entry.
	assert.NotEmpty(t, metrics.ToolCalls, "ToolCalls should not be null when 'Executing tool:' is present")
	if len(metrics.ToolCalls) == 0 {
		return
	}

	sawBash := false
	for _, call := range metrics.ToolCalls {
		if call.Name != "bash" {
			continue
		}
		sawBash = true
		assert.Equal(t, 1, call.CallCount, "bash should have been called once")
	}
	assert.True(t, sawBash, "Expected to find 'bash' in tool calls")
}

// TestCopilotDebugLogMultipleToolCalls checks that repeated "Executing tool:" lines in a
// Copilot CLI debug log are aggregated into per-tool call counts, and that each of the
// three "[DEBUG] data:" blocks is counted as a turn.
func TestCopilotDebugLogMultipleToolCalls(t *testing.T) {
	dir := t.TempDir()

	// aw_info.json declares the copilot engine_id for this run directory.
	infoPath := filepath.Join(dir, "aw_info.json")
	require.NoError(t, os.WriteFile(infoPath, []byte(`{"engine_id": "copilot"}`), 0644))

	// Three data blocks; tool executions: bash, bash, mcp_github.
	debugLog := `2025-09-26T11:13:11.798Z [DEBUG] Starting Copilot CLI: 0.0.400
2025-09-26T11:13:12.000Z [DEBUG] data:
2025-09-26T11:13:12.000Z [DEBUG] {
2025-09-26T11:13:12.000Z [DEBUG] "choices": [{"message": {"tool_calls": []}}],
2025-09-26T11:13:12.000Z [DEBUG] "usage": {"prompt_tokens": 500, "completion_tokens": 10, "total_tokens": 510}
2025-09-26T11:13:12.000Z [DEBUG] }
2025-09-26T11:13:12.500Z [DEBUG] Executing tool: bash
2025-09-26T11:13:13.000Z [DEBUG] data:
2025-09-26T11:13:13.000Z [DEBUG] {
2025-09-26T11:13:13.000Z [DEBUG] "choices": [{"message": {"tool_calls": []}}],
2025-09-26T11:13:13.000Z [DEBUG] "usage": {"prompt_tokens": 600, "completion_tokens": 10, "total_tokens": 610}
2025-09-26T11:13:13.000Z [DEBUG] }
2025-09-26T11:13:13.500Z [DEBUG] Executing tool: bash
2025-09-26T11:13:14.000Z [DEBUG] data:
2025-09-26T11:13:14.000Z [DEBUG] {
2025-09-26T11:13:14.000Z [DEBUG] "choices": [{"message": {"tool_calls": []}}],
2025-09-26T11:13:14.000Z [DEBUG] "usage": {"prompt_tokens": 700, "completion_tokens": 20, "total_tokens": 720}
2025-09-26T11:13:14.000Z [DEBUG] }
2025-09-26T11:13:14.500Z [DEBUG] Executing tool: mcp_github
2025-09-26T11:13:15.000Z [DEBUG] Workflow done`

	agentLogPath := filepath.Join(dir, "agent.log")
	require.NoError(t, os.WriteFile(agentLogPath, []byte(debugLog), 0644))

	metrics, err := extractLogMetrics(dir, false)
	require.NoError(t, err)

	// One turn per data block.
	assert.Equal(t, 3, metrics.Turns, "Should count 3 turns from 3 data blocks")

	// Expected statistics: bash twice, mcp_github once.
	assert.NotEmpty(t, metrics.ToolCalls, "ToolCalls should not be null")

	// Index the reported call counts by tool name for direct lookup.
	callsByTool := map[string]int{}
	for _, call := range metrics.ToolCalls {
		callsByTool[call.Name] = call.CallCount
	}
	assert.Equal(t, 2, callsByTool["bash"], "bash should have 2 calls")
	assert.Equal(t, 1, callsByTool["mcp_github"], "mcp_github should have 1 call")
}
18 changes: 15 additions & 3 deletions pkg/cli/gateway_logs.go
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,11 @@ func buildToolCallsFromRPCMessages(logPath string) ([]MCPToolCall, error) {
return nil, fmt.Errorf("error reading rpc-messages.jsonl: %w", err)
}

// Result slice and processed-key set used while building MCPToolCall records.
// Declared before the first pass so requests without IDs can be appended immediately.
var toolCalls []MCPToolCall
processedKeys := make(map[string]bool)

// First pass: index outgoing tool-call requests by (serverID, id)
for i := range entries {
e := &entries[i]
Expand All @@ -783,6 +788,15 @@ func buildToolCallsFromRPCMessages(logPath string) ([]MCPToolCall, error) {
continue
}
if e.req.ID == nil {
// Requests without an ID cannot be matched to responses.
// Emit the tool call immediately with "unknown" status so it appears
// in the tool_calls list (same as parseRPCMessages counts it in the summary).
toolCalls = append(toolCalls, MCPToolCall{
Timestamp: e.entry.Timestamp,
ServerName: e.entry.ServerID,
ToolName: params.Name,
Status: "unknown",
})
continue
}
t, err := time.Parse(time.RFC3339Nano, e.entry.Timestamp)
Expand All @@ -797,9 +811,7 @@ func buildToolCallsFromRPCMessages(logPath string) ([]MCPToolCall, error) {
}
}

// Second pass: build MCPToolCall records
var toolCalls []MCPToolCall
processedKeys := make(map[string]bool)
// Second pass: pair responses with pending requests to compute durations

for i := range entries {
e := &entries[i]
Expand Down
31 changes: 31 additions & 0 deletions pkg/cli/gateway_logs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -764,3 +764,34 @@ func TestBuildToolCallsFromRPCMessages(t *testing.T) {
assert.Equal(t, "error", getRepo.Status, "status should be error")
assert.Equal(t, "rate limit", getRepo.Error, "error message should be set")
}

// TestBuildToolCallsFromRPCMessagesNullID checks that tools/call requests carrying
// "id":null still yield entries from buildToolCallsFromRPCMessages.
//
// Regression coverage for mcp_tool_usage.tool_calls always being null: parseRPCMessages
// counted such requests in the summary while buildToolCallsFromRPCMessages skipped them.
// Only the explicit "id":null form is exercised here.
func TestBuildToolCallsFromRPCMessagesNullID(t *testing.T) {
	logPath := filepath.Join(t.TempDir(), "rpc-messages.jsonl")

	// Two outgoing requests to the "github" server, both with a null ID, so neither
	// can be paired with a response.
	const rpcLog = `{"timestamp":"2024-01-12T10:00:00.000000000Z","direction":"OUT","type":"REQUEST","server_id":"github","payload":{"jsonrpc":"2.0","id":null,"method":"tools/call","params":{"name":"list_issues","arguments":{}}}}
{"timestamp":"2024-01-12T10:00:01.000000000Z","direction":"OUT","type":"REQUEST","server_id":"github","payload":{"jsonrpc":"2.0","id":null,"method":"tools/call","params":{"name":"issue_read","arguments":{}}}}
`
	require.NoError(t, os.WriteFile(logPath, []byte(rpcLog), 0644))

	calls, err := buildToolCallsFromRPCMessages(logPath)
	require.NoError(t, err, "should build tool calls without error")

	// Each request must produce its own record despite the missing ID.
	assert.Len(t, calls, 2, "null-ID requests should still produce tool call records")

	seen := make(map[string]struct{}, len(calls))
	for _, call := range calls {
		seen[call.ToolName] = struct{}{}
		assert.Equal(t, "github", call.ServerName, "server name should be set")
		assert.Equal(t, "unknown", call.Status, "status should be 'unknown' for null-ID requests")
	}

	for _, want := range []string{"list_issues", "issue_read"} {
		_, ok := seen[want]
		assert.True(t, ok, "should include "+want)
	}
}
58 changes: 41 additions & 17 deletions pkg/workflow/copilot_logs.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ func (e *CopilotEngine) parseSessionJSONL(logContent string, verbose bool) (LogM
toolCallMap := make(map[string]*ToolCallInfo)
var currentSequence []string
turns := 0
assistantMessageCount := 0 // fallback: count assistant messages when num_turns is absent

lines := strings.Split(logContent, "\n")
foundSessionEntry := false
Expand Down Expand Up @@ -78,6 +79,9 @@ func (e *CopilotEngine) parseSessionJSONL(logContent string, verbose bool) (LogM
}

case "assistant":
// Each assistant message represents one LLM turn
assistantMessageCount++

// Assistant message with potential tool calls
if entry.Message != nil {
for _, content := range entry.Message.Content {
Expand Down Expand Up @@ -153,6 +157,14 @@ func (e *CopilotEngine) parseSessionJSONL(logContent string, verbose bool) (LogM
}
}

// If turns was not set from num_turns (0 or absent), fall back to counting assistant messages.
// The Copilot CLI may omit num_turns from the result entry; each assistant message represents
// one LLM conversation turn.
if turns == 0 && assistantMessageCount > 0 {
turns = assistantMessageCount
copilotLogsLog.Printf("num_turns not available in result entry, using assistant message count as turns: %d", turns)
}

// If we found no session entries, return false to indicate fallback needed
if !foundSessionEntry {
return metrics, false
Expand Down Expand Up @@ -227,6 +239,15 @@ func (e *CopilotEngine) ParseLogMetrics(logContent string, verbose bool) LogMetr
if strings.Contains(line, "[DEBUG] data:") {
inDataBlock = true
currentJSONLines = []string{}
// Each API response data block represents one LLM conversation turn.
// Copilot CLI debug logs don't have "User:"/"Human:" patterns, so we
// count turns based on the number of API responses (data blocks).
turns++
// Save previous sequence before starting new turn
Comment on lines 239 to +246
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When a new "[DEBUG] data:" marker is encountered, the parser resets currentJSONLines without first finalizing/parsing any in-progress JSON block. If Copilot debug logs emit back-to-back data: blocks (i.e., the next data: line immediately follows the previous JSON’s closing }), the previous block’s token usage/tool sizes will be dropped, causing undercounted metrics. Treat a new data: marker as an implicit end-of-previous-block: if inDataBlock && len(currentJSONLines)>0, parse/accumulate metrics before resetting, then start the new block and increment turns.

Copilot uses AI. Check for mistakes.
if len(currentSequence) > 0 {
metrics.ToolSequences = append(metrics.ToolSequences, currentSequence)
currentSequence = []string{}
}
continue
}

Expand Down Expand Up @@ -282,17 +303,10 @@ func (e *CopilotEngine) ParseLogMetrics(logContent string, verbose bool) LogMetr
}
}

// Count turns based on interaction patterns (adjust based on actual Copilot CLI output)
if strings.Contains(line, "User:") || strings.Contains(line, "Human:") || strings.Contains(line, "Query:") {
turns++
// Start of a new turn, save previous sequence if any
if len(currentSequence) > 0 {
metrics.ToolSequences = append(metrics.ToolSequences, currentSequence)
currentSequence = []string{}
}
}

// Extract tool calls and add to sequence (adjust based on actual Copilot CLI output format)
// Extract tool calls and add to sequence and toolCallMap
// "Executing tool: <name>" lines confirm tool execution and are used to populate
// both the tool sequence and tool call statistics. This handles the common case where
// Copilot CLI JSON blocks have empty tool_calls arrays but emit execution log lines.
if toolName := e.parseCopilotToolCallsWithSequence(line, toolCallMap); toolName != "" {
currentSequence = append(currentSequence, toolName)
}
Expand Down Expand Up @@ -403,19 +417,29 @@ func (e *CopilotEngine) processToolCalls(toolCalls []any, toolCallMap map[string
}
}

// parseCopilotToolCallsWithSequence extracts tool call information from Copilot CLI log lines and returns tool name
// parseCopilotToolCallsWithSequence extracts tool call information from Copilot CLI log lines and returns tool name.
// It also updates toolCallMap with the tool execution count for statistics tracking.
func (e *CopilotEngine) parseCopilotToolCallsWithSequence(line string, toolCallMap map[string]*ToolCallInfo) string {
// This method handles simple tool execution log lines for sequence tracking
// Tool size extraction is now handled by extractToolCallSizes which parses JSON

// Look for "Executing tool:" pattern in Copilot logs
if strings.Contains(line, "Executing tool:") {
// Extract tool name from "Executing tool: <name>" format
parts := strings.Split(line, "Executing tool:")
if len(parts) > 1 {
toolName := strings.TrimSpace(parts[1])
// Return the tool name for sequence tracking
// Size information is handled separately by extractToolCallSizes
if toolName == "" {
return ""
}
// Update toolCallMap: this captures tool calls from execution log lines.
// This is the primary source of tool call data in the Copilot CLI debug log
// format, since JSON response blocks often have empty tool_calls arrays.
if toolInfo, exists := toolCallMap[toolName]; exists {
toolInfo.CallCount++
} else {
toolCallMap[toolName] = &ToolCallInfo{
Name: toolName,
CallCount: 1,
}
}
Comment on lines +432 to +442
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This increments toolInfo.CallCount for every "Executing tool:" line, but call counts are also incremented when extractToolCallSizes parses non-empty JSON tool_calls arrays (via processToolCalls). In logs where both signals are present (e.g., the log content used in pkg/workflow/copilot_engine_test.go:1364-1386 has a JSON tool_calls entry and then an "Executing tool:" line), the same tool execution will be double-counted. Consider de-duplicating: either (a) treat JSON tool_calls as size-only and derive call counts from execution lines, or (b) only increment from execution lines when the current/previous JSON block had an empty tool_calls array for that turn.

Copilot uses AI. Check for mistakes.
return toolName
}
}
Expand Down
51 changes: 51 additions & 0 deletions pkg/workflow/copilot_session_jsonl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,54 @@ func TestCopilotSessionJSONLToolSizes(t *testing.T) {
t.Error("Expected MaxInputSize to be tracked")
}
}

// TestCopilotSessionJSONLMissingNumTurns checks that ParseLogMetrics derives Turns from
// the number of assistant entries when the session's result entry has no num_turns field.
//
// Regression coverage: Turns was always 0 in run summaries because the Copilot CLI
// sometimes omits num_turns from the result entry.
func TestCopilotSessionJSONLMissingNumTurns(t *testing.T) {
	// Two assistant entries (each a Bash tool_use) and a result entry without num_turns.
	const sessionLog = `{"type":"system","subtype":"init","session_id":"copilot-no-turns","tools":["Bash"],"model":"gpt-4"}
{"type":"assistant","message":{"content":[{"type":"tool_use","id":"tool_1","name":"Bash","input":{"command":"ls"}}]}}
{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"tool_1","content":"file.txt"}]}}
{"type":"assistant","message":{"content":[{"type":"tool_use","id":"tool_2","name":"Bash","input":{"command":"cat file.txt"}}]}}
{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"tool_2","content":"contents"}]}}
{"type":"result","usage":{"input_tokens":500,"output_tokens":50}}`

	copilot := NewCopilotEngine()
	metrics := copilot.ParseLogMetrics(sessionLog, false)

	// Token usage still comes from the result entry: 500 input + 50 output.
	if got := metrics.TokenUsage; got != 550 {
		t.Errorf("Expected 550 tokens, got %d", got)
	}

	// With num_turns absent, the two assistant entries determine the turn count.
	if got := metrics.Turns; got != 2 {
		t.Errorf("Expected 2 turns (from assistant message count fallback), got %d", got)
	}

	// The tool_use items in the assistant entries must be reported as tool calls.
	if len(metrics.ToolCalls) == 0 {
		t.Error("Expected tool calls to be extracted")
	}
}

// TestCopilotSessionJSONLZeroNumTurns checks that ParseLogMetrics falls back to the
// assistant entry count when the result entry reports num_turns as an explicit 0.
func TestCopilotSessionJSONLZeroNumTurns(t *testing.T) {
	// Two assistant entries (one tool_use, one text) and a result entry with "num_turns":0.
	const sessionLog = `{"type":"system","subtype":"init","session_id":"copilot-zero-turns","tools":["Bash"],"model":"gpt-4"}
{"type":"assistant","message":{"content":[{"type":"tool_use","id":"tool_1","name":"Bash","input":{"command":"echo hi"}}]}}
{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"tool_1","content":"hi"}]}}
{"type":"assistant","message":{"content":[{"type":"text","text":"Task complete."}]}}
{"type":"result","usage":{"input_tokens":300,"output_tokens":30},"num_turns":0}`

	copilot := NewCopilotEngine()
	metrics := copilot.ParseLogMetrics(sessionLog, false)

	// A zero num_turns is treated like a missing one: count the assistant entries.
	if got := metrics.Turns; got != 2 {
		t.Errorf("Expected 2 turns (fallback from zero num_turns), got %d", got)
	}
}
Loading