From 38d0b91101a98b9474fa8c8105023a6862d30d31 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 25 Apr 2026 20:21:19 +0000
Subject: [PATCH 1/4] feat: add tool calls diff and tokens-per-turn to audit
 diff mode

- Add ToolCallDiffEntry, ToolCallsDiff, ToolCallsDiffSummary, BashCommandsDiff types
- Add tokens-per-turn fields (Run1TokensPerTurn, Run2TokensPerTurn, TokensPerTurnChange) to RunMetricsDiff
- Add ToolCallsDiff field to RunMetricsDiff for engine-level tool call analysis
- Implement computeToolCallsDiff() diffing LogMetrics.ToolCalls between runs
- Implement computeBashCommandsDiff() for bash-specific analysis (handles generic bash/Bash and per-command bash_* entries from Codex)
- Implement isBashTool() helper for bash tool name matching
- Update computeRunMetricsDiff() to compute tokens-per-turn and include tool calls diff
- Add renderToolCallsDiffPrettySection() and renderBashCommandsDiffPrettySection()
- Add renderToolCallsDiffMarkdownSection() and renderBashCommandsDiffMarkdownSection()
- Update Run Metrics table in both renderers to show Tokens/turn row
- Export ToolCallInfo type alias in pkg/cli/logs_models.go
- Add 15 new unit tests covering all new functionality

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/dbe42488-aa10-4336-bfeb-170f75f44adf

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 pkg/cli/audit_diff.go        | 262 +++++++++++++++++++++++++
 pkg/cli/audit_diff_render.go | 193 ++++++++++++++++++
 pkg/cli/audit_diff_test.go   | 368 +++++++++++++++++++++++++++++++++++
 pkg/cli/logs_models.go       |   4 +
 4 files changed, 827 insertions(+)

diff --git a/pkg/cli/audit_diff.go b/pkg/cli/audit_diff.go
index df89dbd5a6c..f6ccc0c967b 100644
--- a/pkg/cli/audit_diff.go
+++ b/pkg/cli/audit_diff.go
@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"sort"
 	"strconv"
+	"strings"
 	"time"
 
 	"github.com/github/gh-aw/pkg/constants"
@@ -283,6 +284,51 @@ type TokenUsageDiff struct {
 	CacheEfficiencyChange  string  `json:"cache_efficiency_change,omitempty"` // Percentage-point delta, e.g. "+1.5pp"
 }
 
+// ToolCallDiffEntry represents the diff for a single engine-level tool between two runs.
+// Tool data comes from RunSummary.Metrics.ToolCalls (LogMetrics.ToolCalls).
+type ToolCallDiffEntry struct {
+	Name              string `json:"name"`
+	Status            string `json:"status"`                         // "new", "removed", "changed", "unchanged"
+	Run1CallCount     int    `json:"run1_call_count"`                // Call count in run 1 (0 if new)
+	Run2CallCount     int    `json:"run2_call_count"`                // Call count in run 2 (0 if removed)
+	CallCountChange   string `json:"call_count_change,omitempty"`    // e.g. "+3", "-1"
+	Run1MaxInputSize  int    `json:"run1_max_input_size,omitempty"`  // Max input size (tokens) seen in run 1
+	Run2MaxInputSize  int    `json:"run2_max_input_size,omitempty"`  // Max input size (tokens) seen in run 2
+	Run1MaxOutputSize int    `json:"run1_max_output_size,omitempty"` // Max output size (tokens) seen in run 1
+	Run2MaxOutputSize int    `json:"run2_max_output_size,omitempty"` // Max output size (tokens) seen in run 2
+}
+
+// BashCommandsDiff tracks bash-specific tool call differences between two runs.
+// It aggregates calls to the generic "bash" / "Bash" tool and per-command "bash_*" entries
+// (the latter are generated by the Codex engine log parser which records each unique shell command).
+type BashCommandsDiff struct {
+	Run1TotalCalls   int                 `json:"run1_total_calls"`
+	Run2TotalCalls   int                 `json:"run2_total_calls"`
+	TotalCallsChange string              `json:"total_calls_change,omitempty"` // e.g. "+5", "-2"
+	Commands         []ToolCallDiffEntry `json:"commands,omitempty"`           // per-command breakdown (from bash_* names)
+}
+
+// ToolCallsDiffSummary provides a quick overview of engine-level tool call changes
+type ToolCallsDiffSummary struct {
+	NewToolCount     int `json:"new_tool_count"`
+	RemovedToolCount int `json:"removed_tool_count"`
+	ChangedToolCount int `json:"changed_tool_count"`
+	Run1TotalCalls   int `json:"run1_total_calls"` // Total across all tools in run 1
+	Run2TotalCalls   int `json:"run2_total_calls"` // Total across all tools in run 2
+}
+
+// ToolCallsDiff represents the diff of engine-level tool invocations between two runs.
+// It uses data from RunSummary.Metrics.ToolCalls (LogMetrics.ToolCalls) which is populated
+// by engine-specific log parsers (Claude, Codex, Copilot).
+type ToolCallsDiff struct {
+	NewTools     []ToolCallDiffEntry  `json:"new_tools,omitempty"`     // Tools only in run 2
+	RemovedTools []ToolCallDiffEntry  `json:"removed_tools,omitempty"` // Tools only in run 1
+	ChangedTools []ToolCallDiffEntry  `json:"changed_tools,omitempty"` // Tools with changed call counts
+	AllTools     []ToolCallDiffEntry  `json:"all_tools,omitempty"`     // Complete view of all tools across both runs
+	BashDiff     *BashCommandsDiff    `json:"bash_diff,omitempty"`     // Bash-specific analysis
+	Summary      ToolCallsDiffSummary `json:"summary"`
+}
+
 // RunMetricsDiff represents the diff of run-level metrics (token usage, duration, turns) between two runs
 type RunMetricsDiff struct {
 	Run1TokenUsage         int                  `json:"run1_token_usage"`
@@ -294,8 +340,12 @@ type RunMetricsDiff struct {
 	Run1Turns              int                  `json:"run1_turns,omitempty"`
 	Run2Turns              int                  `json:"run2_turns,omitempty"`
 	TurnsChange            int                  `json:"turns_change,omitempty"`
+	Run1TokensPerTurn      int                  `json:"run1_tokens_per_turn,omitempty"`      // Avg effective tokens per turn in run 1
+	Run2TokensPerTurn      int                  `json:"run2_tokens_per_turn,omitempty"`      // Avg effective tokens per turn in run 2
+	TokensPerTurnChange    string               `json:"tokens_per_turn_change,omitempty"`    // e.g. "+20%", "-10%"
 	TokenUsageDetails      *TokenUsageDiff      `json:"token_usage_details,omitempty"`       // Detailed breakdown from firewall proxy
 	GitHubRateLimitDetails *GitHubRateLimitDiff `json:"github_rate_limit_details,omitempty"` // GitHub API quota consumption diff
+	ToolCallsDiff          *ToolCallsDiff       `json:"tool_calls_diff,omitempty"`           // Engine-level tool call diff
 }
 
 // GitHubRateLimitDiff represents the diff of GitHub API quota consumption between two runs.
@@ -471,6 +521,7 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
 	var run1Turns, run2Turns int
 	var tu1, tu2 *TokenUsageSummary
 	var rl1, rl2 *GitHubRateLimitUsage
+	var m1, m2 *LogMetrics
 
 	if summary1 != nil {
 		run1Tokens = summary1.Run.TokenUsage
@@ -478,6 +529,7 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
 		run1Turns = summary1.Run.Turns
 		tu1 = summary1.TokenUsage
 		rl1 = summary1.GitHubRateLimitUsage
+		m1 = &summary1.Metrics
 	}
 	if summary2 != nil {
 		run2Tokens = summary2.Run.TokenUsage
@@ -485,6 +537,7 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
 		run2Turns = summary2.Run.Turns
 		tu2 = summary2.TokenUsage
 		rl2 = summary2.GitHubRateLimitUsage
+		m2 = &summary2.Metrics
 	}
 
 	// Skip if there is no meaningful data
@@ -521,12 +574,221 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
 		}
 	}
 
+	// Compute tokens per turn using effective tokens from firewall proxy when available,
+	// otherwise fall back to the engine-level token count.
+	run1Effective := run1Tokens
+	run2Effective := run2Tokens
+	if tu1 != nil && tu1.TotalEffectiveTokens > 0 {
+		run1Effective = tu1.TotalEffectiveTokens
+	}
+	if tu2 != nil && tu2.TotalEffectiveTokens > 0 {
+		run2Effective = tu2.TotalEffectiveTokens
+	}
+	if run1Turns > 0 {
+		diff.Run1TokensPerTurn = run1Effective / run1Turns
+	}
+	if run2Turns > 0 {
+		diff.Run2TokensPerTurn = run2Effective / run2Turns
+	}
+	if diff.Run1TokensPerTurn > 0 || diff.Run2TokensPerTurn > 0 {
+		diff.TokensPerTurnChange = formatVolumeChange(diff.Run1TokensPerTurn, diff.Run2TokensPerTurn)
+	}
+
 	diff.TokenUsageDetails = computeTokenUsageDiff(tu1, tu2)
 	diff.GitHubRateLimitDetails = computeGitHubRateLimitDiff(rl1, rl2)
+	diff.ToolCallsDiff = computeToolCallsDiff(m1, m2)
+
+	return diff
+}
+
+// isBashTool returns true if the tool name represents a bash/shell invocation.
+// It matches the generic "bash" / "Bash" tool names used by most engines and the
+// per-command "bash_*" entries generated by the Codex log parser.
+func isBashTool(name string) bool {
+	lower := strings.ToLower(name)
+	return lower == "bash" || strings.HasPrefix(lower, "bash_")
+}
+
+// computeToolCallsDiff diffs engine-level tool calls from two LogMetrics values.
+// Returns nil when both metrics have no tool call data.
+func computeToolCallsDiff(m1, m2 *LogMetrics) *ToolCallsDiff {
+	run1Tools := make(map[string]ToolCallInfo)
+	run2Tools := make(map[string]ToolCallInfo)
+
+	if m1 != nil {
+		for _, tc := range m1.ToolCalls {
+			run1Tools[tc.Name] = tc
+		}
+	}
+	if m2 != nil {
+		for _, tc := range m2.ToolCalls {
+			run2Tools[tc.Name] = tc
+		}
+	}
+
+	if len(run1Tools) == 0 && len(run2Tools) == 0 {
+		return nil
+	}
+
+	allNames := make(map[string]bool)
+	for k := range run1Tools {
+		allNames[k] = true
+	}
+	for k := range run2Tools {
+		allNames[k] = true
+	}
+
+	sortedNames := make([]string, 0, len(allNames))
+	for k := range allNames {
+		sortedNames = append(sortedNames, k)
+	}
+	sort.Strings(sortedNames)
+
+	diff := &ToolCallsDiff{}
+	var run1Total, run2Total int
 
+	for _, name := range sortedNames {
+		tc1, inRun1 := run1Tools[name]
+		tc2, inRun2 := run2Tools[name]
+
+		if inRun1 {
+			run1Total += tc1.CallCount
+		}
+		if inRun2 {
+			run2Total += tc2.CallCount
+		}
+
+		var entry ToolCallDiffEntry
+		switch {
+		case !inRun1 && inRun2:
+			entry = ToolCallDiffEntry{
+				Name:              name,
+				Status:            "new",
+				Run2CallCount:     tc2.CallCount,
+				Run2MaxInputSize:  tc2.MaxInputSize,
+				Run2MaxOutputSize: tc2.MaxOutputSize,
+			}
+			diff.NewTools = append(diff.NewTools, entry)
+		case inRun1 && !inRun2:
+			entry = ToolCallDiffEntry{
+				Name:              name,
+				Status:            "removed",
+				Run1CallCount:     tc1.CallCount,
+				Run1MaxInputSize:  tc1.MaxInputSize,
+				Run1MaxOutputSize: tc1.MaxOutputSize,
+			}
+			diff.RemovedTools = append(diff.RemovedTools, entry)
+		case tc1.CallCount != tc2.CallCount:
+			entry = ToolCallDiffEntry{
+				Name:              name,
+				Status:            "changed",
+				Run1CallCount:     tc1.CallCount,
+				Run2CallCount:     tc2.CallCount,
+				CallCountChange:   formatCountChange(tc1.CallCount, tc2.CallCount),
+				Run1MaxInputSize:  tc1.MaxInputSize,
+				Run2MaxInputSize:  tc2.MaxInputSize,
+				Run1MaxOutputSize: tc1.MaxOutputSize,
+				Run2MaxOutputSize: tc2.MaxOutputSize,
+			}
+			diff.ChangedTools = append(diff.ChangedTools, entry)
+		default:
+			entry = ToolCallDiffEntry{
+				Name:              name,
+				Status:            "unchanged",
+				Run1CallCount:     tc1.CallCount,
+				Run2CallCount:     tc2.CallCount,
+				Run1MaxInputSize:  tc1.MaxInputSize,
+				Run2MaxInputSize:  tc2.MaxInputSize,
+				Run1MaxOutputSize: tc1.MaxOutputSize,
+				Run2MaxOutputSize: tc2.MaxOutputSize,
+			}
+		}
+		diff.AllTools = append(diff.AllTools, entry)
+	}
+
+	diff.BashDiff = computeBashCommandsDiff(run1Tools, run2Tools)
+	diff.Summary = ToolCallsDiffSummary{
+		NewToolCount:     len(diff.NewTools),
+		RemovedToolCount: len(diff.RemovedTools),
+		ChangedToolCount: len(diff.ChangedTools),
+		Run1TotalCalls:   run1Total,
+		Run2TotalCalls:   run2Total,
+	}
+
+	auditDiffLog.Printf("Tool calls diff: new=%d, removed=%d, changed=%d, run1_total=%d, run2_total=%d",
+		len(diff.NewTools), len(diff.RemovedTools), len(diff.ChangedTools), run1Total, run2Total)
 	return diff
 }
 
+// computeBashCommandsDiff builds bash-specific analysis from the tool call maps.
+// It collects all bash-related entries (generic "bash"/"Bash" and per-command "bash_*")
+// and produces a summary with per-command call count comparison.
+// Returns nil when no bash tool calls are present in either run.
+func computeBashCommandsDiff(run1Tools, run2Tools map[string]ToolCallInfo) *BashCommandsDiff {
+	allNames := make(map[string]bool)
+	for k := range run1Tools {
+		if isBashTool(k) {
+			allNames[k] = true
+		}
+	}
+	for k := range run2Tools {
+		if isBashTool(k) {
+			allNames[k] = true
+		}
+	}
+
+	if len(allNames) == 0 {
+		return nil
+	}
+
+	sortedNames := make([]string, 0, len(allNames))
+	for k := range allNames {
+		sortedNames = append(sortedNames, k)
+	}
+	sort.Strings(sortedNames)
+
+	bashDiff := &BashCommandsDiff{}
+	for _, name := range sortedNames {
+		tc1 := run1Tools[name]
+		tc2 := run2Tools[name]
+		bashDiff.Run1TotalCalls += tc1.CallCount
+		bashDiff.Run2TotalCalls += tc2.CallCount
+
+		var status string
+		switch {
+		case tc1.CallCount == 0 && tc2.CallCount > 0:
+			status = "new"
+		case tc1.CallCount > 0 && tc2.CallCount == 0:
+			status = "removed"
+		case tc1.CallCount != tc2.CallCount:
+			status = "changed"
+		default:
+			status = "unchanged"
+		}
+
+		cmd := ToolCallDiffEntry{
+			Name:              name,
+			Status:            status,
+			Run1CallCount:     tc1.CallCount,
+			Run2CallCount:     tc2.CallCount,
+			Run1MaxInputSize:  tc1.MaxInputSize,
+			Run2MaxInputSize:  tc2.MaxInputSize,
+			Run1MaxOutputSize: tc1.MaxOutputSize,
+			Run2MaxOutputSize: tc2.MaxOutputSize,
+		}
+		if tc1.CallCount != tc2.CallCount {
+			cmd.CallCountChange = formatCountChange(tc1.CallCount, tc2.CallCount)
+		}
+		bashDiff.Commands = append(bashDiff.Commands, cmd)
+	}
+
+	if bashDiff.Run1TotalCalls > 0 || bashDiff.Run2TotalCalls > 0 {
+		bashDiff.TotalCallsChange = formatCountChange(bashDiff.Run1TotalCalls, bashDiff.Run2TotalCalls)
+	}
+
+	return bashDiff
+}
+
 // computeGitHubRateLimitDiff computes the diff of GitHub API quota consumption between two
 // runs using the GitHubRateLimitUsage data from RunSummary.GitHubRateLimitUsage.
 // Returns nil when both summaries are nil.
diff --git a/pkg/cli/audit_diff_render.go b/pkg/cli/audit_diff_render.go
index 0a6cece62b6..de9dc0dba58 100644
--- a/pkg/cli/audit_diff_render.go
+++ b/pkg/cli/audit_diff_render.go
@@ -263,6 +263,9 @@ func renderRunMetricsDiffMarkdownSection(run1ID, run2ID int64, diff *RunMetricsD
 		turnsChange := fmt.Sprintf("%+d", diff.TurnsChange)
 		fmt.Printf("| Turns | %d | %d | %s |\n", diff.Run1Turns, diff.Run2Turns, turnsChange)
 	}
+	if diff.Run1TokensPerTurn > 0 || diff.Run2TokensPerTurn > 0 {
+		fmt.Printf("| Tokens / turn | %d | %d | %s |\n", diff.Run1TokensPerTurn, diff.Run2TokensPerTurn, diff.TokensPerTurnChange)
+	}
 	fmt.Println()
 
 	if diff.TokenUsageDetails != nil {
@@ -271,6 +274,9 @@ func renderRunMetricsDiffMarkdownSection(run1ID, run2ID int64, diff *RunMetricsD
 	if diff.GitHubRateLimitDetails != nil {
 		renderGitHubRateLimitDiffMarkdownSection(run1ID, run2ID, diff.GitHubRateLimitDetails)
 	}
+	if diff.ToolCallsDiff != nil {
+		renderToolCallsDiffMarkdownSection(run1ID, run2ID, diff.ToolCallsDiff)
+	}
 }
 
 // renderTokenUsageDiffMarkdownSection renders detailed token usage as a markdown sub-section
@@ -503,6 +509,14 @@ func renderRunMetricsDiffPrettySection(run1ID, run2ID int64, diff *RunMetricsDif
 			fmt.Sprintf("%+d", diff.TurnsChange),
 		})
 	}
+	if diff.Run1TokensPerTurn > 0 || diff.Run2TokensPerTurn > 0 {
+		config.Rows = append(config.Rows, []string{
+			"Tokens / turn",
+			strconv.Itoa(diff.Run1TokensPerTurn),
+			strconv.Itoa(diff.Run2TokensPerTurn),
+			diff.TokensPerTurnChange,
+		})
+	}
 
 	if len(config.Rows) > 0 {
 		fmt.Fprint(os.Stderr, console.RenderTable(config))
@@ -516,6 +530,10 @@ func renderRunMetricsDiffPrettySection(run1ID, run2ID int64, diff *RunMetricsDif
 		fmt.Fprintln(os.Stderr)
 		renderGitHubRateLimitDiffPrettySection(run1ID, run2ID, diff.GitHubRateLimitDetails)
 	}
+	if diff.ToolCallsDiff != nil {
+		fmt.Fprintln(os.Stderr)
+		renderToolCallsDiffPrettySection(run1ID, run2ID, diff.ToolCallsDiff)
+	}
 }
 
 // renderTokenUsageDiffPrettySection renders detailed token usage as a pretty console sub-section
@@ -660,6 +678,181 @@ func renderGitHubRateLimitDiffPrettySection(run1ID, run2ID int64, diff *GitHubRa
 	}
 }
 
+// renderToolCallsDiffPrettySection renders the engine-level tool calls diff as a pretty console sub-section.
+// It shows a high-level table of all tool types and a dedicated bash commands breakdown.
+func renderToolCallsDiffPrettySection(run1ID, run2ID int64, diff *ToolCallsDiff) {
+	if diff == nil {
+		return
+	}
+
+	fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Tool Call Breakdown"))
+	fmt.Fprintln(os.Stderr)
+
+	// All-tools overview table
+	if len(diff.AllTools) > 0 {
+		config := console.TableConfig{
+			Headers: []string{"Tool", fmt.Sprintf("Run #%d", run1ID), fmt.Sprintf("Run #%d", run2ID), "Change"},
+			Rows:    make([][]string, 0, len(diff.AllTools)),
+		}
+		for _, entry := range diff.AllTools {
+			change := entry.CallCountChange
+			if change == "" && entry.Status == "unchanged" {
+				change = "—"
+			}
+			config.Rows = append(config.Rows, []string{
+				entry.Name,
+				strconv.Itoa(entry.Run1CallCount),
+				strconv.Itoa(entry.Run2CallCount),
+				change,
+			})
+		}
+		fmt.Fprint(os.Stderr, console.RenderTable(config))
+	}
+
+	// Bash-specific breakdown
+	if diff.BashDiff != nil {
+		fmt.Fprintln(os.Stderr)
+		renderBashCommandsDiffPrettySection(run1ID, run2ID, diff.BashDiff)
+	}
+}
+
+// renderBashCommandsDiffPrettySection renders the bash commands breakdown as a pretty console sub-section.
+func renderBashCommandsDiffPrettySection(run1ID, run2ID int64, diff *BashCommandsDiff) {
+	fmt.Fprintln(os.Stderr, console.FormatSectionHeader("Bash Commands"))
+	fmt.Fprintln(os.Stderr)
+
+	// Summary line
+	fmt.Fprintln(os.Stderr, console.FormatInfoMessage(
+		fmt.Sprintf("Total bash calls: Run #%d=%d, Run #%d=%d (%s)",
+			run1ID, diff.Run1TotalCalls,
+			run2ID, diff.Run2TotalCalls,
+			diff.TotalCallsChange),
+	))
+
+	if len(diff.Commands) > 0 {
+		fmt.Fprintln(os.Stderr)
+		config := console.TableConfig{
+			Headers: []string{"Command", fmt.Sprintf("Run #%d", run1ID), fmt.Sprintf("Run #%d", run2ID), "Change", "Max Input", "Max Output"},
+			Rows:    make([][]string, 0, len(diff.Commands)),
+		}
+		for _, cmd := range diff.Commands {
+			change := cmd.CallCountChange
+			if change == "" {
+				change = "—"
+			}
+			maxInput := "—"
+			if cmd.Run1MaxInputSize > 0 || cmd.Run2MaxInputSize > 0 {
+				v1 := strconv.Itoa(cmd.Run1MaxInputSize)
+				v2 := strconv.Itoa(cmd.Run2MaxInputSize)
+				if cmd.Run1MaxInputSize == 0 {
+					v1 = "—"
+				}
+				if cmd.Run2MaxInputSize == 0 {
+					v2 = "—"
+				}
+				maxInput = v1 + " / " + v2
+			}
+			maxOutput := "—"
+			if cmd.Run1MaxOutputSize > 0 || cmd.Run2MaxOutputSize > 0 {
+				v1 := strconv.Itoa(cmd.Run1MaxOutputSize)
+				v2 := strconv.Itoa(cmd.Run2MaxOutputSize)
+				if cmd.Run1MaxOutputSize == 0 {
+					v1 = "—"
+				}
+				if cmd.Run2MaxOutputSize == 0 {
+					v2 = "—"
+				}
+				maxOutput = v1 + " / " + v2
+			}
+			config.Rows = append(config.Rows, []string{
+				cmd.Name,
+				strconv.Itoa(cmd.Run1CallCount),
+				strconv.Itoa(cmd.Run2CallCount),
+				change,
+				maxInput,
+				maxOutput,
+			})
+		}
+		fmt.Fprint(os.Stderr, console.RenderTable(config))
+	}
+}
+
+// renderToolCallsDiffMarkdownSection renders the engine-level tool calls diff as markdown.
+// It includes a full tool type table and a bash commands breakdown.
+func renderToolCallsDiffMarkdownSection(run1ID, run2ID int64, diff *ToolCallsDiff) {
+	if diff == nil {
+		return
+	}
+
+	fmt.Println("#### Tool Call Breakdown")
+	fmt.Println()
+
+	if len(diff.AllTools) > 0 {
+		fmt.Printf("| Tool | Run #%d | Run #%d | Change |\n", run1ID, run2ID)
+		fmt.Println("|------|---------|---------|--------|")
+		for _, entry := range diff.AllTools {
+			change := entry.CallCountChange
+			if change == "" {
+				change = "—"
+			}
+			fmt.Printf("| `%s` | %d | %d | %s |\n", entry.Name, entry.Run1CallCount, entry.Run2CallCount, change)
+		}
+		fmt.Println()
+	}
+
+	if diff.BashDiff != nil {
+		renderBashCommandsDiffMarkdownSection(run1ID, run2ID, diff.BashDiff)
+	}
+}
+
+// renderBashCommandsDiffMarkdownSection renders the bash commands diff sub-section as markdown.
+func renderBashCommandsDiffMarkdownSection(run1ID, run2ID int64, diff *BashCommandsDiff) {
+	fmt.Println("#### Bash Commands")
+	fmt.Println()
+	fmt.Printf("Total bash calls: Run #%d=%d, Run #%d=%d (%s)\n\n",
+		run1ID, diff.Run1TotalCalls,
+		run2ID, diff.Run2TotalCalls,
+		diff.TotalCallsChange)
+
+	if len(diff.Commands) > 0 {
+		fmt.Printf("| Command | Run #%d | Run #%d | Change | Max Input (r1/r2) | Max Output (r1/r2) |\n", run1ID, run2ID)
+		fmt.Println("|---------|---------|---------|--------|-------------------|-------------------|")
+		for _, cmd := range diff.Commands {
+			change := cmd.CallCountChange
+			if change == "" {
+				change = "—"
+			}
+			maxInput := "—"
+			if cmd.Run1MaxInputSize > 0 || cmd.Run2MaxInputSize > 0 {
+				v1 := strconv.Itoa(cmd.Run1MaxInputSize)
+				v2 := strconv.Itoa(cmd.Run2MaxInputSize)
+				if cmd.Run1MaxInputSize == 0 {
+					v1 = "—"
+				}
+				if cmd.Run2MaxInputSize == 0 {
+					v2 = "—"
+				}
+				maxInput = v1 + " / " + v2
+			}
+			maxOutput := "—"
+			if cmd.Run1MaxOutputSize > 0 || cmd.Run2MaxOutputSize > 0 {
+				v1 := strconv.Itoa(cmd.Run1MaxOutputSize)
+				v2 := strconv.Itoa(cmd.Run2MaxOutputSize)
+				if cmd.Run1MaxOutputSize == 0 {
+					v1 = "—"
+				}
+				if cmd.Run2MaxOutputSize == 0 {
+					v2 = "—"
+				}
+				maxOutput = v1 + " / " + v2
+			}
+			fmt.Printf("| `%s` | %d | %d | %s | %s | %s |\n",
+				cmd.Name, cmd.Run1CallCount, cmd.Run2CallCount, change, maxInput, maxOutput)
+		}
+		fmt.Println()
+	}
+}
+
 // firewallStatusEmoji returns the status emoji for a domain status
 func firewallStatusEmoji(status string) string {
 	switch status {
diff --git a/pkg/cli/audit_diff_test.go b/pkg/cli/audit_diff_test.go
index 555e59dbf78..4124fc039fc 100644
--- a/pkg/cli/audit_diff_test.go
+++ b/pkg/cli/audit_diff_test.go
@@ -1032,3 +1032,371 @@ func TestComputeRunMetricsDiff_RateLimitAloneNotNil(t *testing.T) {
 	require.NotNil(t, diff, "diff should not be nil when rate limit data is present")
 	require.NotNil(t, diff.GitHubRateLimitDetails, "GitHubRateLimitDetails should be populated")
 }
+
+// --- Tool calls diff tests ---
+
+func TestComputeToolCallsDiff_BothNil(t *testing.T) {
+	result := computeToolCallsDiff(nil, nil)
+	assert.Nil(t, result, "Should return nil when both metrics are nil")
+}
+
+func TestComputeToolCallsDiff_BothEmpty(t *testing.T) {
+	m1 := &LogMetrics{}
+	m2 := &LogMetrics{}
+	result := computeToolCallsDiff(m1, m2)
+	assert.Nil(t, result, "Should return nil when both metrics have no tool calls")
+}
+
+func TestComputeToolCallsDiff_NewTools(t *testing.T) {
+	m1 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "gh", CallCount: 5},
+		},
+	}
+	m2 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "gh", CallCount: 5},
+			{Name: "bash", CallCount: 3, MaxInputSize: 200, MaxOutputSize: 500},
+			{Name: "edit", CallCount: 2},
+		},
+	}
+
+	diff := computeToolCallsDiff(m1, m2)
+	require.NotNil(t, diff, "Should produce diff when tool calls exist")
+
+	assert.Len(t, diff.NewTools, 2, "Should have 2 new tools (bash, edit)")
+	assert.Empty(t, diff.RemovedTools, "Should have no removed tools")
+	assert.Empty(t, diff.ChangedTools, "Should have no changed tools")
+	assert.Len(t, diff.AllTools, 3, "AllTools should include all 3 tools")
+
+	bashEntry := findToolCallDiffEntry(diff.NewTools, "bash")
+	require.NotNil(t, bashEntry, "Should find bash in new tools")
+	assert.Equal(t, "new", bashEntry.Status, "bash status should be 'new'")
+	assert.Equal(t, 3, bashEntry.Run2CallCount, "bash call count should be 3")
+	assert.Equal(t, 200, bashEntry.Run2MaxInputSize, "bash max input should be 200")
+	assert.Equal(t, 500, bashEntry.Run2MaxOutputSize, "bash max output should be 500")
+
+	assert.Equal(t, 2, diff.Summary.NewToolCount, "Summary should show 2 new tools")
+	assert.Equal(t, 5, diff.Summary.Run1TotalCalls, "Run1 total should be 5")
+	assert.Equal(t, 10, diff.Summary.Run2TotalCalls, "Run2 total should be 10 (5+3+2)")
+}
+
+func TestComputeToolCallsDiff_RemovedTools(t *testing.T) {
+	m1 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 8},
+			{Name: "gh", CallCount: 4},
+			{Name: "edit", CallCount: 2},
+		},
+	}
+	m2 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 6},
+		},
+	}
+
+	diff := computeToolCallsDiff(m1, m2)
+	require.NotNil(t, diff, "Should produce diff")
+
+	assert.Len(t, diff.RemovedTools, 2, "Should have 2 removed tools (gh, edit)")
+	assert.Empty(t, diff.NewTools, "Should have no new tools")
+
+	assert.Equal(t, 2, diff.Summary.RemovedToolCount, "Summary should show 2 removed tools")
+	assert.Equal(t, 14, diff.Summary.Run1TotalCalls, "Run1 total should be 14")
+	assert.Equal(t, 6, diff.Summary.Run2TotalCalls, "Run2 total should be 6")
+}
+
+func TestComputeToolCallsDiff_ChangedTools(t *testing.T) {
+	m1 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 5, MaxOutputSize: 300},
+			{Name: "gh", CallCount: 3},
+		},
+	}
+	m2 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 12, MaxOutputSize: 800},
+			{Name: "gh", CallCount: 3},
+		},
+	}
+
+	diff := computeToolCallsDiff(m1, m2)
+	require.NotNil(t, diff, "Should produce diff")
+
+	assert.Len(t, diff.ChangedTools, 1, "Should have 1 changed tool (bash)")
+	assert.Empty(t, diff.NewTools, "Should have no new tools")
+	assert.Empty(t, diff.RemovedTools, "Should have no removed tools")
+
+	bashEntry := findToolCallDiffEntry(diff.ChangedTools, "bash")
+	require.NotNil(t, bashEntry, "Should find bash in changed tools")
+	assert.Equal(t, "changed", bashEntry.Status, "bash status should be 'changed'")
+	assert.Equal(t, 5, bashEntry.Run1CallCount, "bash run1 call count should be 5")
+	assert.Equal(t, 12, bashEntry.Run2CallCount, "bash run2 call count should be 12")
+	assert.Equal(t, "+7", bashEntry.CallCountChange, "bash change should be +7")
+	assert.Equal(t, 300, bashEntry.Run1MaxOutputSize, "run1 max output should be 300")
+	assert.Equal(t, 800, bashEntry.Run2MaxOutputSize, "run2 max output should be 800")
+
+	assert.Equal(t, 1, diff.Summary.ChangedToolCount, "Summary should show 1 changed tool")
+}
+
+func TestComputeToolCallsDiff_AllToolsContainsEverything(t *testing.T) {
+	m1 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 4},
+			{Name: "gh", CallCount: 2},
+		},
+	}
+	m2 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 8},
+			{Name: "edit", CallCount: 3},
+		},
+	}
+
+	diff := computeToolCallsDiff(m1, m2)
+	require.NotNil(t, diff, "Should produce diff")
+
+	// AllTools should include bash (changed), gh (removed), edit (new) - all 3 tools
+	assert.Len(t, diff.AllTools, 3, "AllTools should contain all 3 unique tools")
+
+	statuses := make(map[string]string)
+	for _, e := range diff.AllTools {
+		statuses[e.Name] = e.Status
+	}
+	assert.Equal(t, "changed", statuses["bash"], "bash should be changed")
+	assert.Equal(t, "removed", statuses["gh"], "gh should be removed")
+	assert.Equal(t, "new", statuses["edit"], "edit should be new")
+}
+
+func TestComputeToolCallsDiff_SortedOutput(t *testing.T) {
+	m1 := &LogMetrics{}
+	m2 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "z-tool", CallCount: 1},
+			{Name: "a-tool", CallCount: 1},
+			{Name: "m-tool", CallCount: 1},
+		},
+	}
+
+	diff := computeToolCallsDiff(m1, m2)
+	require.NotNil(t, diff, "Should produce diff")
+	require.Len(t, diff.AllTools, 3, "Should have 3 tools in AllTools")
+
+	// AllTools is built from sorted keys
+	assert.Equal(t, "a-tool", diff.AllTools[0].Name, "First tool should be a-tool (sorted)")
+	assert.Equal(t, "m-tool", diff.AllTools[1].Name, "Second tool should be m-tool (sorted)")
+	assert.Equal(t, "z-tool", diff.AllTools[2].Name, "Third tool should be z-tool (sorted)")
+}
+
+// --- Bash commands diff tests ---
+
+func TestComputeBashCommandsDiff_NoBash(t *testing.T) {
+	run1Tools := map[string]ToolCallInfo{
+		"gh": {Name: "gh", CallCount: 5},
+	}
+	run2Tools := map[string]ToolCallInfo{
+		"gh": {Name: "gh", CallCount: 3},
+	}
+	result := computeBashCommandsDiff(run1Tools, run2Tools)
+	assert.Nil(t, result, "Should return nil when no bash tools present")
+}
+
+func TestComputeBashCommandsDiff_GenericBash(t *testing.T) {
+	run1Tools := map[string]ToolCallInfo{
+		"bash": {Name: "bash", CallCount: 5},
+		"gh":   {Name: "gh", CallCount: 3},
+	}
+	run2Tools := map[string]ToolCallInfo{
+		"bash": {Name: "bash", CallCount: 10},
+		"gh":   {Name: "gh", CallCount: 3},
+	}
+
+	result := computeBashCommandsDiff(run1Tools, run2Tools)
+	require.NotNil(t, result, "Should produce bash diff when bash tool present")
+
+	assert.Equal(t, 5, result.Run1TotalCalls, "Run1 total bash calls should be 5")
+	assert.Equal(t, 10, result.Run2TotalCalls, "Run2 total bash calls should be 10")
+	assert.Equal(t, "+5", result.TotalCallsChange, "Total change should be +5")
+
+	require.Len(t, result.Commands, 1, "Should have 1 command (bash)")
+	assert.Equal(t, "bash", result.Commands[0].Name, "Command should be bash")
+	assert.Equal(t, "changed", result.Commands[0].Status, "bash status should be changed")
+	assert.Equal(t, "+5", result.Commands[0].CallCountChange, "bash change should be +5")
+}
+
+func TestComputeBashCommandsDiff_PerCommandTracking(t *testing.T) {
+	// Codex-style per-command bash tracking
+	run1Tools := map[string]ToolCallInfo{
+		"bash_git_status": {Name: "bash_git_status", CallCount: 3},
+		"bash_ls_la":      {Name: "bash_ls_la", CallCount: 1},
+		"bash_cat_readme": {Name: "bash_cat_readme", CallCount: 2},
+	}
+	run2Tools := map[string]ToolCallInfo{
+		"bash_git_status": {Name: "bash_git_status", CallCount: 5},
+		"bash_git_diff":   {Name: "bash_git_diff", CallCount: 4},
+		"bash_cat_readme": {Name: "bash_cat_readme", CallCount: 2},
+	}
+
+	result := computeBashCommandsDiff(run1Tools, run2Tools)
+	require.NotNil(t, result, "Should produce bash diff")
+
+	assert.Equal(t, 6, result.Run1TotalCalls, "Run1 total: 3+1+2=6")
+	assert.Equal(t, 11, result.Run2TotalCalls, "Run2 total: 5+4+2=11")
+	assert.Equal(t, "+5", result.TotalCallsChange, "Total change should be +5")
+
+	require.Len(t, result.Commands, 4, "Should have 4 commands (ls removed, diff added, status/cat changed/unchanged)")
+
+	statusMap := make(map[string]string)
+	for _, cmd := range result.Commands {
+		statusMap[cmd.Name] = cmd.Status
+	}
+	assert.Equal(t, "changed", statusMap["bash_git_status"], "git_status should be changed")
+	assert.Equal(t, "removed", statusMap["bash_ls_la"], "ls_la should be removed")
+	assert.Equal(t, "unchanged", statusMap["bash_cat_readme"], "cat_readme should be unchanged")
+	assert.Equal(t, "new", statusMap["bash_git_diff"], "git_diff should be new")
+}
+
+func TestComputeBashCommandsDiff_BashCapitalized(t *testing.T) {
+	// Claude uses "Bash" (capitalized)
+	run1Tools := map[string]ToolCallInfo{
+		"Bash": {Name: "Bash", CallCount: 7},
+	}
+	run2Tools := map[string]ToolCallInfo{
+		"Bash": {Name: "Bash", CallCount: 4},
+	}
+
+	result := computeBashCommandsDiff(run1Tools, run2Tools)
+	require.NotNil(t, result, "Should detect capitalized Bash tool")
+	assert.Equal(t, 7, result.Run1TotalCalls, "Run1 total should be 7")
+	assert.Equal(t, 4, result.Run2TotalCalls, "Run2 total should be 4")
+}
+
+// --- Tokens per turn tests ---
+
+func TestComputeRunMetricsDiff_TokensPerTurn(t *testing.T) {
+	summary1 := &RunSummary{
+		Run: WorkflowRun{
+			TokenUsage: 10000,
+			Turns:      5,
+		},
+	}
+	summary2 := &RunSummary{
+		Run: WorkflowRun{
+			TokenUsage: 18000,
+			Turns:      6,
+		},
+	}
+
+	diff := computeRunMetricsDiff(summary1, summary2)
+	require.NotNil(t, diff, "Should produce metrics diff")
+
+	// Without effective tokens, falls back to engine token count
+	assert.Equal(t, 2000, diff.Run1TokensPerTurn, "Run1 tokens/turn should be 10000/5=2000")
+	assert.Equal(t, 3000, diff.Run2TokensPerTurn, "Run2 tokens/turn should be 18000/6=3000")
+	assert.Equal(t, "+50%", diff.TokensPerTurnChange, "Tokens/turn should increase by 50%")
+}
+
+func TestComputeRunMetricsDiff_TokensPerTurnFromEffective(t *testing.T) {
+	// When effective token data is available it should be used for tokens/turn
+	summary1 := &RunSummary{
+		Run: WorkflowRun{
+			TokenUsage: 10000,
+			Turns:      4,
+		},
+		TokenUsage: &TokenUsageSummary{
+			TotalEffectiveTokens: 8000,
+			TotalInputTokens:     10000,
+			TotalRequests:        4,
+		},
+	}
+	summary2 := &RunSummary{
+		Run: WorkflowRun{
+			TokenUsage: 16000,
+			Turns:      4,
+		},
+		TokenUsage: &TokenUsageSummary{
+			TotalEffectiveTokens: 12000,
+			TotalInputTokens:     16000,
+			TotalRequests:        4,
+		},
+	}
+
+	diff := computeRunMetricsDiff(summary1, summary2)
+	require.NotNil(t, diff, "Should produce metrics diff")
+
+	// Effective tokens should be used: 8000/4=2000, 12000/4=3000
+	assert.Equal(t, 2000, diff.Run1TokensPerTurn, "Run1 tokens/turn should use effective: 8000/4=2000")
+	assert.Equal(t, 3000, diff.Run2TokensPerTurn, "Run2 tokens/turn should use effective: 12000/4=3000")
+}
+
+func TestComputeRunMetricsDiff_TokensPerTurnZeroTurns(t *testing.T) {
+	// When turns = 0, tokens per turn should remain 0 (no division)
+	summary1 := &RunSummary{
+		Run: WorkflowRun{
+			TokenUsage: 5000,
+			Turns:      0,
+		},
+	}
+	summary2 := &RunSummary{
+		Run: WorkflowRun{
+			TokenUsage: 8000,
+			Turns:      4,
+		},
+	}
+
+	diff := computeRunMetricsDiff(summary1, summary2)
+	require.NotNil(t, diff, "Should produce metrics diff")
+
+	assert.Equal(t, 0, diff.Run1TokensPerTurn, "Run1 tokens/turn should be 0 when turns=0")
+	assert.Equal(t, 2000, diff.Run2TokensPerTurn, "Run2 tokens/turn should be 8000/4=2000")
+}
+
+// --- Tool calls diff in RunMetricsDiff integration test ---
+
+func TestComputeRunMetricsDiff_WithToolCallsDiff(t *testing.T) {
+	m1 := LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 5},
+			{Name: "gh", CallCount: 3},
+		},
+		Turns: 4,
+	}
+	m2 := LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 12},
+			{Name: "gh", CallCount: 3},
+			{Name: "edit", CallCount: 4},
+		},
+		Turns: 6,
+	}
+	summary1 := &RunSummary{
+		Run:     WorkflowRun{TokenUsage: 5000, Turns: 4},
+		Metrics: m1,
+	}
+	summary2 := &RunSummary{
+		Run:     WorkflowRun{TokenUsage: 9000, Turns: 6},
+		Metrics: m2,
+	}
+
+	diff := computeRunMetricsDiff(summary1, summary2)
+	require.NotNil(t, diff, "Should produce metrics diff")
+	require.NotNil(t, diff.ToolCallsDiff, "Should include tool calls diff")
+
+	assert.Len(t, diff.ToolCallsDiff.NewTools, 1, "Should have 1 new tool (edit)")
+	assert.Len(t, diff.ToolCallsDiff.ChangedTools, 1, "Should have 1 changed tool (bash)")
+	assert.Empty(t, diff.ToolCallsDiff.RemovedTools, "Should have no removed tools")
+
+	require.NotNil(t, diff.ToolCallsDiff.BashDiff, "Should have bash diff")
+	assert.Equal(t, 5, diff.ToolCallsDiff.BashDiff.Run1TotalCalls, "Bash run1 total should be 5")
+	assert.Equal(t, 12, diff.ToolCallsDiff.BashDiff.Run2TotalCalls, "Bash run2 total should be 12")
+}
+
+// findToolCallDiffEntry is a test helper to find a tool entry by name
+func findToolCallDiffEntry(entries []ToolCallDiffEntry, name string) *ToolCallDiffEntry {
+	for i := range entries {
+		if entries[i].Name == name {
+			return &entries[i]
+		}
+	}
+	return nil
+}
diff --git a/pkg/cli/logs_models.go b/pkg/cli/logs_models.go
index 8390d56266d..2776efb399f 100644
--- a/pkg/cli/logs_models.go
+++ b/pkg/cli/logs_models.go
@@ -82,6 +82,10 @@ type WorkflowRun struct {
 // This is now an alias to the shared type in workflow package
 type LogMetrics = workflow.LogMetrics
 
+// ToolCallInfo represents statistics for a single tool invocation type
+// This is an alias to the shared type in workflow package
+type ToolCallInfo = workflow.ToolCallInfo
+
 // ProcessedRun represents a workflow run with its associated analysis
 type ProcessedRun struct {
 	Run                     WorkflowRun

From 6967b114c3f45d4b74153991158fe32c3ef894d0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 25 Apr 2026 20:25:14 +0000
Subject: [PATCH 2/4] refactor: extract formatMaxSizeCell helper and remove
 redundant map traversal in bash diff

- Extract formatMaxSizeCell() helper to remove duplicated max-size formatting in pretty and markdown renderers
- Collect bash tools during main iteration in computeToolCallsDiff() so computeBashCommandsDiff() receives pre-filtered maps, avoiding a second traversal
- Update tests to pass pre-filtered bash tool maps to computeBashCommandsDiff() directly

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/dbe42488-aa10-4336-bfeb-170f75f44adf

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 pkg/cli/audit_diff.go        | 26 +++++++------
 pkg/cli/audit_diff_render.go | 72 +++++++++++-------------------------
 pkg/cli/audit_diff_test.go   | 12 ++----
 3 files changed, 40 insertions(+), 70 deletions(-)

diff --git a/pkg/cli/audit_diff.go b/pkg/cli/audit_diff.go
index f6ccc0c967b..bf17b2e11d2 100644
--- a/pkg/cli/audit_diff.go
+++ b/pkg/cli/audit_diff.go
@@ -646,6 +646,9 @@ func computeToolCallsDiff(m1, m2 *LogMetrics) *ToolCallsDiff {
 
 	diff := &ToolCallsDiff{}
 	var run1Total, run2Total int
+	// Collect bash tools during the main iteration to avoid a second traversal in computeBashCommandsDiff.
+	bashRun1 := make(map[string]ToolCallInfo)
+	bashRun2 := make(map[string]ToolCallInfo)
 
 	for _, name := range sortedNames {
 		tc1, inRun1 := run1Tools[name]
@@ -653,9 +656,15 @@ func computeToolCallsDiff(m1, m2 *LogMetrics) *ToolCallsDiff {
 
 		if inRun1 {
 			run1Total += tc1.CallCount
+			if isBashTool(name) {
+				bashRun1[name] = tc1
+			}
 		}
 		if inRun2 {
 			run2Total += tc2.CallCount
+			if isBashTool(name) {
+				bashRun2[name] = tc2
+			}
 		}
 
 		var entry ToolCallDiffEntry
@@ -706,7 +715,7 @@ func computeToolCallsDiff(m1, m2 *LogMetrics) *ToolCallsDiff {
 		diff.AllTools = append(diff.AllTools, entry)
 	}
 
-	diff.BashDiff = computeBashCommandsDiff(run1Tools, run2Tools)
+	diff.BashDiff = computeBashCommandsDiff(bashRun1, bashRun2)
 	diff.Summary = ToolCallsDiffSummary{
 		NewToolCount:     len(diff.NewTools),
 		RemovedToolCount: len(diff.RemovedTools),
@@ -720,21 +729,16 @@ func computeToolCallsDiff(m1, m2 *LogMetrics) *ToolCallsDiff {
 	return diff
 }
 
-// computeBashCommandsDiff builds bash-specific analysis from the tool call maps.
-// It collects all bash-related entries (generic "bash"/"Bash" and per-command "bash_*")
-// and produces a summary with per-command call count comparison.
-// Returns nil when no bash tool calls are present in either run.
+// computeBashCommandsDiff builds bash-specific analysis from pre-filtered bash tool call maps.
+// The maps should contain only bash-related entries (generic "bash"/"Bash" and per-command "bash_*").
+// Returns nil when no bash tool calls are present in either map.
 func computeBashCommandsDiff(run1Tools, run2Tools map[string]ToolCallInfo) *BashCommandsDiff {
 	allNames := make(map[string]bool)
 	for k := range run1Tools {
-		if isBashTool(k) {
-			allNames[k] = true
-		}
+		allNames[k] = true
 	}
 	for k := range run2Tools {
-		if isBashTool(k) {
-			allNames[k] = true
-		}
+		allNames[k] = true
 	}
 
 	if len(allNames) == 0 {
diff --git a/pkg/cli/audit_diff_render.go b/pkg/cli/audit_diff_render.go
index de9dc0dba58..cdcbe82d7af 100644
--- a/pkg/cli/audit_diff_render.go
+++ b/pkg/cli/audit_diff_render.go
@@ -740,37 +740,13 @@ func renderBashCommandsDiffPrettySection(run1ID, run2ID int64, diff *BashCommand
 			if change == "" {
 				change = "—"
 			}
-			maxInput := "—"
-			if cmd.Run1MaxInputSize > 0 || cmd.Run2MaxInputSize > 0 {
-				v1 := strconv.Itoa(cmd.Run1MaxInputSize)
-				v2 := strconv.Itoa(cmd.Run2MaxInputSize)
-				if cmd.Run1MaxInputSize == 0 {
-					v1 = "—"
-				}
-				if cmd.Run2MaxInputSize == 0 {
-					v2 = "—"
-				}
-				maxInput = v1 + " / " + v2
-			}
-			maxOutput := "—"
-			if cmd.Run1MaxOutputSize > 0 || cmd.Run2MaxOutputSize > 0 {
-				v1 := strconv.Itoa(cmd.Run1MaxOutputSize)
-				v2 := strconv.Itoa(cmd.Run2MaxOutputSize)
-				if cmd.Run1MaxOutputSize == 0 {
-					v1 = "—"
-				}
-				if cmd.Run2MaxOutputSize == 0 {
-					v2 = "—"
-				}
-				maxOutput = v1 + " / " + v2
-			}
 			config.Rows = append(config.Rows, []string{
 				cmd.Name,
 				strconv.Itoa(cmd.Run1CallCount),
 				strconv.Itoa(cmd.Run2CallCount),
 				change,
-				maxInput,
-				maxOutput,
+				formatMaxSizeCell(cmd.Run1MaxInputSize, cmd.Run2MaxInputSize),
+				formatMaxSizeCell(cmd.Run1MaxOutputSize, cmd.Run2MaxOutputSize),
 			})
 		}
 		fmt.Fprint(os.Stderr, console.RenderTable(config))
@@ -822,37 +798,31 @@ func renderBashCommandsDiffMarkdownSection(run1ID, run2ID int64, diff *BashComma
 			if change == "" {
 				change = "—"
 			}
-			maxInput := "—"
-			if cmd.Run1MaxInputSize > 0 || cmd.Run2MaxInputSize > 0 {
-				v1 := strconv.Itoa(cmd.Run1MaxInputSize)
-				v2 := strconv.Itoa(cmd.Run2MaxInputSize)
-				if cmd.Run1MaxInputSize == 0 {
-					v1 = "—"
-				}
-				if cmd.Run2MaxInputSize == 0 {
-					v2 = "—"
-				}
-				maxInput = v1 + " / " + v2
-			}
-			maxOutput := "—"
-			if cmd.Run1MaxOutputSize > 0 || cmd.Run2MaxOutputSize > 0 {
-				v1 := strconv.Itoa(cmd.Run1MaxOutputSize)
-				v2 := strconv.Itoa(cmd.Run2MaxOutputSize)
-				if cmd.Run1MaxOutputSize == 0 {
-					v1 = "—"
-				}
-				if cmd.Run2MaxOutputSize == 0 {
-					v2 = "—"
-				}
-				maxOutput = v1 + " / " + v2
-			}
 			fmt.Printf("| `%s` | %d | %d | %s | %s | %s |\n",
-				cmd.Name, cmd.Run1CallCount, cmd.Run2CallCount, change, maxInput, maxOutput)
+				cmd.Name, cmd.Run1CallCount, cmd.Run2CallCount, change,
+				formatMaxSizeCell(cmd.Run1MaxInputSize, cmd.Run2MaxInputSize),
+				formatMaxSizeCell(cmd.Run1MaxOutputSize, cmd.Run2MaxOutputSize))
 		}
 		fmt.Println()
 	}
 }
 
+// formatMaxSizeCell formats a "run1 / run2" max-size pair for display in a table cell.
+// Returns "—" when both values are zero, and omits the individual value if it is zero.
+func formatMaxSizeCell(run1Size, run2Size int) string {
+	if run1Size == 0 && run2Size == 0 {
+		return "—"
+	}
+	v1, v2 := strconv.Itoa(run1Size), strconv.Itoa(run2Size)
+	if run1Size == 0 {
+		v1 = "—"
+	}
+	if run2Size == 0 {
+		v2 = "—"
+	}
+	return v1 + " / " + v2
+}
+
 // firewallStatusEmoji returns the status emoji for a domain status
 func firewallStatusEmoji(status string) string {
 	switch status {
diff --git a/pkg/cli/audit_diff_test.go b/pkg/cli/audit_diff_test.go
index 4124fc039fc..41989413874 100644
--- a/pkg/cli/audit_diff_test.go
+++ b/pkg/cli/audit_diff_test.go
@@ -1191,24 +1191,20 @@ func TestComputeToolCallsDiff_SortedOutput(t *testing.T) {
 // --- Bash commands diff tests ---
 
 func TestComputeBashCommandsDiff_NoBash(t *testing.T) {
-	run1Tools := map[string]ToolCallInfo{
-		"gh": {Name: "gh", CallCount: 5},
-	}
-	run2Tools := map[string]ToolCallInfo{
-		"gh": {Name: "gh", CallCount: 3},
-	}
+	// computeBashCommandsDiff receives pre-filtered maps; passing no bash tools → nil
+	run1Tools := map[string]ToolCallInfo{}
+	run2Tools := map[string]ToolCallInfo{}
 	result := computeBashCommandsDiff(run1Tools, run2Tools)
 	assert.Nil(t, result, "Should return nil when no bash tools present")
 }
 
 func TestComputeBashCommandsDiff_GenericBash(t *testing.T) {
+	// Only bash tools are passed to computeBashCommandsDiff
 	run1Tools := map[string]ToolCallInfo{
 		"bash": {Name: "bash", CallCount: 5},
-		"gh":   {Name: "gh", CallCount: 3},
 	}
 	run2Tools := map[string]ToolCallInfo{
 		"bash": {Name: "bash", CallCount: 10},
-		"gh":   {Name: "gh", CallCount: 3},
 	}
 
 	result := computeBashCommandsDiff(run1Tools, run2Tools)

From 4fbf62822f3896a3b9f0149756912175384ec7b2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Sat, 25 Apr 2026 20:48:01 +0000
Subject: [PATCH 3/4] docs(adr): add draft ADR-28494 for tool-call breakdown
 and tokens-per-turn in audit diff

Generated by the Design Decision Gate workflow for PR #28494.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ...calls-and-tokens-per-turn-in-audit-diff.md | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 docs/adr/28494-tool-calls-and-tokens-per-turn-in-audit-diff.md

diff --git a/docs/adr/28494-tool-calls-and-tokens-per-turn-in-audit-diff.md b/docs/adr/28494-tool-calls-and-tokens-per-turn-in-audit-diff.md
new file mode 100644
index 00000000000..f704644a248
--- /dev/null
+++ b/docs/adr/28494-tool-calls-and-tokens-per-turn-in-audit-diff.md
@@ -0,0 +1,91 @@
+# ADR-28494: Embed Tool-Call Breakdown and Tokens-per-Turn Metrics in Audit Diff Output
+
+**Date**: 2026-04-25
+**Status**: Draft
+**Deciders**: pelikhan, Copilot
+
+---
+
+## Part 1 — Narrative (Human-Friendly)
+
+### Context
+
+The `audit diff` command compares two workflow runs side-by-side and surfaces metrics like total token usage, turns, duration, and cache efficiency. When agents investigate cost regressions between runs they frequently cannot determine *why* tokens changed — whether the regression comes from more turns, heavier per-turn usage, or a shift in which tools (bash, gh, edit, etc.) are being called. The existing `RunMetricsDiff` structure exposed aggregate token counts and turn counts, but no per-turn token rate and no breakdown by tool type. Engine-level tool call data was already being parsed into `RunSummary.Metrics.ToolCalls` (populated by the Claude, Codex, and Copilot log parsers) but was never surfaced in the diff output.
+
+### Decision
+
+We will enrich `RunMetricsDiff` with two new data structures — `ToolCallsDiff` and a tokens-per-turn scalar — and render both in the existing pretty-console and markdown diff renderers. Tokens per turn uses effective tokens from the firewall proxy when available, falling back to the engine-level token count. Tool call data is sourced from the already-computed `LogMetrics.ToolCalls` slice; bash-related entries (`bash`, `Bash`, `bash_*`) are separated into a dedicated `BashCommandsDiff` sub-structure to expose the Codex-style per-command granularity. All new types follow the existing JSON-serialisable struct conventions used by `TokenUsageDiff` and `GitHubRateLimitDiff`.
+
+### Alternatives Considered
+
+#### Alternative 1: Separate `audit tool-calls` subcommand
+
+A dedicated subcommand could show tool-call detail for a single run or a pair. It was considered because it avoids enlarging the diff output and keeps concerns separated. It was not chosen because the tool-call delta is meaningful only in the context of a comparison; the driver is always "why did cost change between run A and run B?" Putting the data in the diff keeps it contextual and eliminates extra round trips for agents.
+
+#### Alternative 2: Log-level analysis only — do not change the diff output
+
+Agents could perform deeper log analysis themselves by querying the raw run logs. This was considered because it keeps the diff output lean. It was not chosen because the relevant data (`RunSummary.Metrics.ToolCalls`) is already materialised in memory during diff computation; re-parsing logs adds latency and requires agents to implement the aggregation logic every time, which is exactly what prompted this change.
+
+#### Alternative 3: Add data to JSON output only, no render changes
+
+Extending the JSON struct without adding renderer support would satisfy machine consumers but not the human-readable console/markdown reports that are the primary use-case for `audit diff`. This approach was not chosen because the primary consumer is the rendered diff report read by agents and engineers.
+
+### Consequences
+
+#### Positive
+- Agents can immediately see which tool types drove a token or call-count change between two runs without additional log queries.
+- Per-turn token efficiency distinguishes "more turns" regressions from "heavier per-turn" regressions, enabling more targeted fixes.
+- Bash command granularity (via Codex's `bash_*` naming) exposes specific shell commands that changed frequency — actionable detail for prompt/workflow optimisation.
+- The `AllTools` slice provides a complete cross-run view, not just the delta, which helps verify expected tool usage patterns.
+
+#### Negative
+- The diff output grows in length; runs with many tool types will produce lengthy "Tool Call Breakdown" sections that may be noisy when there are no significant changes.
+- The `isBashTool` helper encodes engine-specific naming conventions (`bash`, `Bash`, `bash_*`) directly in the diff logic, creating a coupling point that must be updated if a new engine uses a different shell tool naming scheme.
+- Tokens-per-turn uses integer division, silently discarding the fractional part; the resulting value can appear identical between two runs even when there is a small real difference.
+
+#### Neutral
+- The new `ToolCallInfo` type alias is exported from `logs_models.go` alongside the existing `LogMetrics` alias, following the established aliasing pattern for shared workflow types.
+- Bash diff computation receives pre-filtered maps from the parent iteration to avoid a second traversal, which is a performance micro-optimisation that future readers should be aware of when modifying the iteration logic.
+
+---
+
+## Part 2 — Normative Specification (RFC 2119)
+
+> The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, **SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this section are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119).
+
+### Tokens-per-Turn Computation
+
+1. Implementations **MUST** compute tokens-per-turn as `effectiveTokens / turns` where `effectiveTokens` is `TokenUsageSummary.TotalEffectiveTokens` when that value is greater than zero.
+2. Implementations **MUST** fall back to the engine-level token count (`WorkflowRun.TokenUsage`) when `TotalEffectiveTokens` is zero or the `TokenUsageSummary` is absent.
+3. Implementations **MUST NOT** compute a tokens-per-turn value when the turn count is zero (to avoid division by zero).
+4. Implementations **SHOULD** format the tokens-per-turn change as a percentage string (e.g., `+50%`, `-10%`) using the same `formatVolumeChange` helper applied to other percentage-point metrics.
+
+### Tool Calls Diff
+
+1. Implementations **MUST** source tool call data from `RunSummary.Metrics.ToolCalls` (`LogMetrics.ToolCalls`) and **MUST NOT** re-parse raw log files during diff computation.
+2. Implementations **MUST** produce a `ToolCallsDiff` that classifies each tool as `new`, `removed`, `changed`, or `unchanged` relative to the baseline run.
+3. Implementations **MUST** include every tool seen in either run in the `AllTools` slice, sorted lexicographically by tool name.
+4. Implementations **MUST** return `nil` for `ToolCallsDiff` when both runs have no tool call data, to keep the output clean for runs predating this feature.
+5. Implementations **SHOULD** include per-entry `MaxInputSize` and `MaxOutputSize` values to provide token-size context for each tool type.
+
+### Bash-Specific Breakdown
+
+1. Implementations **MUST** treat tool names matching `bash` or `Bash` (case-insensitive equality) and names with the prefix `bash_` (case-insensitive) as bash tool invocations.
+2. Implementations **MUST** aggregate all bash tool entries into a `BashCommandsDiff` sub-structure and report their combined call count for each run.
+3. Implementations **MUST** collect bash tool entries during the main tool iteration loop and **MUST NOT** perform a second traversal of the tool maps to build the bash diff.
+4. Implementations **MUST** return `nil` for `BashCommandsDiff` when no bash tool calls are present in either run.
+
+### Rendering
+
+1. Implementations **MUST** render `ToolCallsDiff` in both the pretty-console and markdown output paths when the diff is non-nil.
+2. Implementations **MUST** render the tokens-per-turn row in the Run Metrics table when at least one of `Run1TokensPerTurn` or `Run2TokensPerTurn` is greater than zero.
+3. Implementations **SHOULD** use a `formatMaxSizeCell` helper (or equivalent) to format `run1 / run2` size pairs, displaying `—` when both values are zero and omitting the individual value when it is zero.
+4. Implementations **MAY** omit the Bash Commands sub-section from the rendered output when `BashDiff` is nil.
+
+### Conformance
+
+An implementation is considered conformant with this ADR if it satisfies all **MUST** and **MUST NOT** requirements above. Failure to meet any **MUST** or **MUST NOT** requirement constitutes non-conformance.
+
+---
+
+*This is a DRAFT ADR generated by the [Design Decision Gate](https://github.com/github/gh-aw/actions/runs/24940226956) workflow. The PR author must review, complete, and finalize this document before the PR can merge.*

From 874e7a8cf9593f23e377b8391dd996b77dc098de Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 25 Apr 2026 20:53:14 +0000
Subject: [PATCH 4/4] fix: aggregate duplicate tool names and always show dash
 for empty change cells
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- In computeToolCallsDiff: aggregate duplicate tool entries (sum CallCount, take max of sizes) instead of overwriting, matching how other consumers handle metrics appended from multiple log files
- In renderToolCallsDiffPrettySection: always substitute "—" for empty change cells regardless of status, consistent with markdown renderer
- Add TestComputeToolCallsDiff_DuplicateToolNames test

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/60183031-f19f-4034-bf21-75f6983738ad

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 pkg/cli/audit_diff.go        | 24 ++++++++++++++++++++++--
 pkg/cli/audit_diff_render.go |  2 +-
 pkg/cli/audit_diff_test.go   | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/pkg/cli/audit_diff.go b/pkg/cli/audit_diff.go
index bf17b2e11d2..5a66e2a493a 100644
--- a/pkg/cli/audit_diff.go
+++ b/pkg/cli/audit_diff.go
@@ -615,14 +615,34 @@ func computeToolCallsDiff(m1, m2 *LogMetrics) *ToolCallsDiff {
 	run1Tools := make(map[string]ToolCallInfo)
 	run2Tools := make(map[string]ToolCallInfo)
 
+	// aggregateToolCall merges a tool call entry into the map, summing call counts and
+	// taking the max of size fields to handle duplicate entries across log files.
+	aggregateToolCall := func(tools map[string]ToolCallInfo, tc ToolCallInfo) {
+		if existing, ok := tools[tc.Name]; ok {
+			existing.CallCount += tc.CallCount
+			if tc.MaxInputSize > existing.MaxInputSize {
+				existing.MaxInputSize = tc.MaxInputSize
+			}
+			if tc.MaxOutputSize > existing.MaxOutputSize {
+				existing.MaxOutputSize = tc.MaxOutputSize
+			}
+			if tc.MaxDuration > existing.MaxDuration {
+				existing.MaxDuration = tc.MaxDuration
+			}
+			tools[tc.Name] = existing
+			return
+		}
+		tools[tc.Name] = tc
+	}
+
 	if m1 != nil {
 		for _, tc := range m1.ToolCalls {
-			run1Tools[tc.Name] = tc
+			aggregateToolCall(run1Tools, tc)
 		}
 	}
 	if m2 != nil {
 		for _, tc := range m2.ToolCalls {
-			run2Tools[tc.Name] = tc
+			aggregateToolCall(run2Tools, tc)
 		}
 	}
 
diff --git a/pkg/cli/audit_diff_render.go b/pkg/cli/audit_diff_render.go
index cdcbe82d7af..bd991a9b735 100644
--- a/pkg/cli/audit_diff_render.go
+++ b/pkg/cli/audit_diff_render.go
@@ -696,7 +696,7 @@ func renderToolCallsDiffPrettySection(run1ID, run2ID int64, diff *ToolCallsDiff)
 		}
 		for _, entry := range diff.AllTools {
 			change := entry.CallCountChange
-			if change == "" && entry.Status == "unchanged" {
+			if change == "" {
 				change = "—"
 			}
 			config.Rows = append(config.Rows, []string{
diff --git a/pkg/cli/audit_diff_test.go b/pkg/cli/audit_diff_test.go
index 41989413874..6a30074cfce 100644
--- a/pkg/cli/audit_diff_test.go
+++ b/pkg/cli/audit_diff_test.go
@@ -1047,6 +1047,40 @@ func TestComputeToolCallsDiff_BothEmpty(t *testing.T) {
 	assert.Nil(t, result, "Should return nil when both metrics have no tool calls")
 }
 
+func TestComputeToolCallsDiff_DuplicateToolNames(t *testing.T) {
+	// When the same tool name appears multiple times (e.g. metrics aggregated from multiple log files),
+	// call counts should be summed and max sizes kept.
+	m1 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 3, MaxInputSize: 100, MaxOutputSize: 200},
+			{Name: "bash", CallCount: 4, MaxInputSize: 150, MaxOutputSize: 180},
+		},
+	}
+	m2 := &LogMetrics{
+		ToolCalls: []ToolCallInfo{
+			{Name: "bash", CallCount: 5, MaxInputSize: 120, MaxOutputSize: 300},
+			{Name: "bash", CallCount: 2, MaxInputSize: 90, MaxOutputSize: 250},
+		},
+	}
+
+	diff := computeToolCallsDiff(m1, m2)
+	require.NotNil(t, diff, "Should produce diff")
+
+	require.Len(t, diff.AllTools, 1, "Should have 1 unique tool (bash)")
+	bash := diff.AllTools[0]
+	assert.Equal(t, "bash", bash.Name, "Tool should be bash")
+	// Run1: 3+4=7, Run2: 5+2=7 → unchanged
+	assert.Equal(t, 7, bash.Run1CallCount, "Run1 call count should be sum: 3+4=7")
+	assert.Equal(t, 7, bash.Run2CallCount, "Run2 call count should be sum: 5+2=7")
+	assert.Equal(t, "unchanged", bash.Status, "Status should be unchanged when counts are equal")
+	// Max input: run1=max(100,150)=150, run2=max(120,90)=120
+	assert.Equal(t, 150, bash.Run1MaxInputSize, "Run1 max input should be max(100,150)=150")
+	assert.Equal(t, 120, bash.Run2MaxInputSize, "Run2 max input should be max(120,90)=120")
+	// Max output: run1=max(200,180)=200, run2=max(300,250)=300
+	assert.Equal(t, 200, bash.Run1MaxOutputSize, "Run1 max output should be max(200,180)=200")
+	assert.Equal(t, 300, bash.Run2MaxOutputSize, "Run2 max output should be max(300,250)=300")
+}
+
 func TestComputeToolCallsDiff_NewTools(t *testing.T) {
 	m1 := &LogMetrics{
 		ToolCalls: []ToolCallInfo{