From 150709e90572ee1c3a481d0f53239036f44caef5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Apr 2026 22:51:17 +0000 Subject: [PATCH 1/3] feat: add Time Between Turns (TBT) to audit/logs command and glossary - Add AvgTimeBetweenTurns and MaxTimeBetweenTurns to LogMetrics - Parse TBT from user.message timestamps in events.jsonl (Copilot engine) - Add TBT fields + cache warning to SessionAnalysis in audit command - Render TBT with Anthropic 5-min cache-expiry warning in audit report - Add AvgTimeBetweenTurns to WorkflowRun and RunData for logs command - Propagate TBT through logs orchestrator pipeline - Add TBT glossary entry to docs - Add unit tests for TBT computation in parseEventsJSONLFile Agent-Logs-Url: https://github.com/github/gh-aw/sessions/ac5d0f93-faf6-480f-878a-17d194e35075 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- docs/src/content/docs/reference/glossary.md | 11 ++++ pkg/cli/audit_expanded.go | 56 +++++++++++++++--- pkg/cli/audit_report_render_overview.go | 23 +++++--- pkg/cli/copilot_events_jsonl.go | 36 ++++++++++++ pkg/cli/copilot_events_jsonl_test.go | 64 +++++++++++++++++++++ pkg/cli/logs_models.go | 55 +++++++++--------- pkg/cli/logs_orchestrator.go | 2 + pkg/cli/logs_report.go | 7 +++ pkg/workflow/metrics.go | 12 ++-- 9 files changed, 218 insertions(+), 48 deletions(-) diff --git a/docs/src/content/docs/reference/glossary.md b/docs/src/content/docs/reference/glossary.md index 666d27083f5..697e3131d92 100644 --- a/docs/src/content/docs/reference/glossary.md +++ b/docs/src/content/docs/reference/glossary.md @@ -405,6 +405,17 @@ A feature of `gh aw logs` that aggregates firewall, MCP, and metrics data across A weighted token count that normalizes raw API token usage into a single comparable value for cost estimation and monitoring. Computed by applying cache and output multipliers to each token category (input, output, cache read, cache write) and summing the results. Appears in audit reports, `gh aw logs` output, and safe-output message footers (as `{effective_tokens}` and `{effective_tokens_formatted}`). For episode-level aggregation, `total_estimated_cost` uses effective tokens as its basis. See [Effective Tokens Specification](/gh-aw/reference/effective-tokens-specification/). +### Time Between Turns (TBT) + +The elapsed time between consecutive LLM API calls in an agentic workflow run. A "turn" is one complete LLM inference request; TBT measures the gap from when the model finishes one response (and tool calls are dispatched) to when the next request is sent (after all tool results are collected). TBT is an important performance and cost metric because LLM inference providers implement prompt caching with a fixed TTL: + +- **Anthropic** reduced their cache TTL from 1 hour to **5 minutes**. If the TBT for any turn exceeds 5 minutes, the cached prompt context expires and the full prompt must be re-processed, significantly increasing token costs. +- **OpenAI** has a similar server-side prompt cache with variable TTL. + +`gh aw audit` reports both average and maximum TBT in the Session Analysis section. A cache warning is emitted when the maximum observed TBT exceeds the Anthropic 5-minute threshold. For Copilot engine runs, TBT is computed from precise per-turn timestamps in the `events.jsonl` session log; for other engines it is estimated from total wall time divided by turn count. + +To reduce TBT — and keep prompt caches warm — minimize blocking tool calls, parallelize independent tool invocations, and avoid long-running shell commands in the critical path between turns. + ### Firewall Analysis A section of the `gh aw audit` report that breaks down all network requests made during a workflow run — showing allowed domains, denied domains, request volumes, and policy attribution. Derived from AWF firewall logs. Use `gh aw audit diff` to compare firewall behavior across runs and identify new or removed domain accesses. See [Audit Commands](/gh-aw/reference/audit/) and [Network Permissions](/gh-aw/reference/network/). diff --git a/pkg/cli/audit_expanded.go b/pkg/cli/audit_expanded.go index 110535cd918..2b9093e745e 100644 --- a/pkg/cli/audit_expanded.go +++ b/pkg/cli/audit_expanded.go @@ -36,13 +36,16 @@ type PromptAnalysis struct { // SessionAnalysis represents session and agent performance metrics type SessionAnalysis struct { - WallTime string `json:"wall_time,omitempty" console:"header:Wall Time,omitempty"` - TurnCount int `json:"turn_count,omitempty" console:"header:Turn Count,omitempty"` - AvgTurnDuration string `json:"avg_turn_duration,omitempty" console:"header:Avg Turn Duration,omitempty"` - TokensPerMinute float64 `json:"tokens_per_minute,omitempty"` - TimeoutDetected bool `json:"timeout_detected"` - NoopCount int `json:"noop_count,omitempty" console:"header:Noop Count,omitempty"` - AgentActiveRatio float64 `json:"agent_active_ratio,omitempty"` // 0.0 - 1.0 + WallTime string `json:"wall_time,omitempty" console:"header:Wall Time,omitempty"` + TurnCount int `json:"turn_count,omitempty" console:"header:Turn Count,omitempty"` + AvgTurnDuration string `json:"avg_turn_duration,omitempty" console:"header:Avg Turn Duration,omitempty"` + AvgTimeBetweenTurns string `json:"avg_time_between_turns,omitempty" console:"header:Avg Time Between Turns,omitempty"` + MaxTimeBetweenTurns string `json:"max_time_between_turns,omitempty" console:"header:Max Time Between Turns,omitempty"` + TokensPerMinute float64 `json:"tokens_per_minute,omitempty"` + TimeoutDetected bool `json:"timeout_detected"` + NoopCount int `json:"noop_count,omitempty" console:"header:Noop Count,omitempty"` + AgentActiveRatio float64 `json:"agent_active_ratio,omitempty"` // 0.0 - 1.0 + CacheWarning string `json:"cache_warning,omitempty" console:"header:Cache Warning,omitempty"` } // SafeOutputSummary provides a summary of safe output items by type @@ -205,6 +208,41 @@ func buildSessionAnalysis(processedRun ProcessedRun, metrics LogMetrics) *Sessio session.AvgTurnDuration = timeutil.FormatDuration(avgTurnDuration) } + // Time Between Turns (TBT): prefer precise per-turn timestamps from log metrics; + // fall back to wall-time / turns when timestamps are unavailable. + // TBT measures the gap between consecutive LLM API calls (tool execution overhead). + // Anthropic's prompt cache TTL is 5 minutes — if TBT exceeds this, cache entries + // expire and every turn incurs full prompt re-processing costs. + const anthropicCacheTTL = 5 * time.Minute + if metrics.AvgTimeBetweenTurns > 0 { + session.AvgTimeBetweenTurns = timeutil.FormatDuration(metrics.AvgTimeBetweenTurns) + if metrics.MaxTimeBetweenTurns > 0 { + session.MaxTimeBetweenTurns = timeutil.FormatDuration(metrics.MaxTimeBetweenTurns) + } + // Warn when the maximum observed TBT exceeds the Anthropic cache TTL. + if metrics.MaxTimeBetweenTurns > anthropicCacheTTL { + session.CacheWarning = fmt.Sprintf( + "Max TBT (%s) exceeds Anthropic 5-min cache TTL — prompt cache will expire between turns, increasing cost", + timeutil.FormatDuration(metrics.MaxTimeBetweenTurns), + ) + } else if metrics.AvgTimeBetweenTurns > anthropicCacheTTL { + session.CacheWarning = fmt.Sprintf( + "Avg TBT (%s) exceeds Anthropic 5-min cache TTL — prompt cache likely expiring between turns", + timeutil.FormatDuration(metrics.AvgTimeBetweenTurns), + ) + } + } else if metrics.Turns > 1 && run.Duration > 0 { + // Fallback: average cycle time as a proxy for TBT. + avgTBT := run.Duration / time.Duration(metrics.Turns) + session.AvgTimeBetweenTurns = timeutil.FormatDuration(avgTBT) + " (estimated)" + if avgTBT > anthropicCacheTTL { + session.CacheWarning = fmt.Sprintf( + "Estimated avg TBT (%s) exceeds Anthropic 5-min cache TTL — prompt cache likely expiring between turns", + timeutil.FormatDuration(avgTBT), + ) + } + } + // Tokens per minute if metrics.TokenUsage > 0 && run.Duration > 0 { minutes := run.Duration.Minutes() @@ -226,8 +264,8 @@ func buildSessionAnalysis(processedRun ProcessedRun, metrics LogMetrics) *Sessio } } - auditExpandedLog.Printf("Built session analysis: turns=%d, wall_time=%s, timeout=%v", - session.TurnCount, session.WallTime, session.TimeoutDetected) + auditExpandedLog.Printf("Built session analysis: turns=%d, wall_time=%s, avg_tbt=%s, max_tbt=%s, timeout=%v", + session.TurnCount, session.WallTime, session.AvgTimeBetweenTurns, session.MaxTimeBetweenTurns, session.TimeoutDetected) return session } diff --git a/pkg/cli/audit_report_render_overview.go b/pkg/cli/audit_report_render_overview.go index 41c5d9426e7..a76078cfe92 100644 --- a/pkg/cli/audit_report_render_overview.go +++ b/pkg/cli/audit_report_render_overview.go @@ -172,24 +172,33 @@ func renderSessionAnalysis(session *SessionAnalysis) { return } if session.WallTime != "" { - fmt.Fprintf(os.Stderr, " Wall Time: %s\n", session.WallTime) + fmt.Fprintf(os.Stderr, " Wall Time: %s\n", session.WallTime) } if session.TurnCount > 0 { - fmt.Fprintf(os.Stderr, " Turn Count: %d\n", session.TurnCount) + fmt.Fprintf(os.Stderr, " Turn Count: %d\n", session.TurnCount) } if session.AvgTurnDuration != "" { - fmt.Fprintf(os.Stderr, " Avg Turn Duration: %s\n", session.AvgTurnDuration) + fmt.Fprintf(os.Stderr, " Avg Turn Duration: %s\n", session.AvgTurnDuration) + } + if session.AvgTimeBetweenTurns != "" { + fmt.Fprintf(os.Stderr, " Avg Time Between Turns: %s\n", session.AvgTimeBetweenTurns) + } + if session.MaxTimeBetweenTurns != "" { + fmt.Fprintf(os.Stderr, " Max Time Between Turns: %s\n", session.MaxTimeBetweenTurns) + } + if session.CacheWarning != "" { + fmt.Fprintf(os.Stderr, " Cache Warning: %s\n", console.FormatWarningMessage(session.CacheWarning)) } if session.TokensPerMinute > 0 { - fmt.Fprintf(os.Stderr, " Tokens/Minute: %.1f\n", session.TokensPerMinute) + fmt.Fprintf(os.Stderr, " Tokens/Minute: %.1f\n", session.TokensPerMinute) } if session.NoopCount > 0 { - fmt.Fprintf(os.Stderr, " Noop Count: %d\n", session.NoopCount) + fmt.Fprintf(os.Stderr, " Noop Count: %d\n", session.NoopCount) } if session.TimeoutDetected { - fmt.Fprintf(os.Stderr, " Timeout Detected: %s\n", console.FormatWarningMessage("Yes")) + fmt.Fprintf(os.Stderr, " Timeout Detected: %s\n", console.FormatWarningMessage("Yes")) } else { - fmt.Fprintf(os.Stderr, " Timeout Detected: %s\n", console.FormatSuccessMessage("No")) + fmt.Fprintf(os.Stderr, " Timeout Detected: %s\n", console.FormatSuccessMessage("No")) } fmt.Fprintln(os.Stderr) } diff --git a/pkg/cli/copilot_events_jsonl.go b/pkg/cli/copilot_events_jsonl.go index ecf5e1a9275..50a4d947b76 100644 --- a/pkg/cli/copilot_events_jsonl.go +++ b/pkg/cli/copilot_events_jsonl.go @@ -18,6 +18,7 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/github/gh-aw/pkg/console" "github.com/github/gh-aw/pkg/logger" @@ -174,6 +175,9 @@ func parseEventsJSONLFile(path string, verbose bool) (workflow.LogMetrics, error totalTokens := 0 foundAnyEvent := false + // Per-turn timestamps used to compute Time Between Turns (TBT) + var turnTimestamps []time.Time + scanner := bufio.NewScanner(file) buf := make([]byte, maxScannerBufferSize) scanner.Buffer(buf, maxScannerBufferSize) @@ -205,6 +209,14 @@ func parseEventsJSONLFile(path string, verbose bool) (workflow.LogMetrics, error metrics.ToolSequences = append(metrics.ToolSequences, currentSequence) currentSequence = []string{} } + // Record the timestamp for TBT computation. + if entry.Timestamp != "" { + if ts, parseErr := time.Parse(time.RFC3339Nano, entry.Timestamp); parseErr == nil { + turnTimestamps = append(turnTimestamps, ts) + } else if ts, parseErr = time.Parse(time.RFC3339, entry.Timestamp); parseErr == nil { + turnTimestamps = append(turnTimestamps, ts) + } + } copilotEventsJSONLLog.Printf("user.message: turn=%d", turns) case "tool.execution_start": @@ -267,6 +279,30 @@ func parseEventsJSONLFile(path string, verbose bool) (workflow.LogMetrics, error metrics.TokenUsage = totalTokens metrics.Turns = turns + // Compute Time Between Turns (TBT) from per-turn timestamps. + // TBT[i] = timestamp[i] - timestamp[i-1] for i > 0. Two or more timestamps + // are required to measure at least one interval. + if len(turnTimestamps) >= 2 { + var totalTBT time.Duration + var maxTBT time.Duration + for i := 1; i < len(turnTimestamps); i++ { + tbt := turnTimestamps[i].Sub(turnTimestamps[i-1]) + if tbt > 0 { + totalTBT += tbt + if tbt > maxTBT { + maxTBT = tbt + } + } + } + intervals := len(turnTimestamps) - 1 + if totalTBT > 0 && intervals > 0 { + metrics.AvgTimeBetweenTurns = totalTBT / time.Duration(intervals) + metrics.MaxTimeBetweenTurns = maxTBT + copilotEventsJSONLLog.Printf("TBT computed: avg=%s max=%s intervals=%d", + metrics.AvgTimeBetweenTurns, metrics.MaxTimeBetweenTurns, intervals) + } + } + copilotEventsJSONLLog.Printf("Parsed events.jsonl: turns=%d totalTokens=%d toolCalls=%d sequences=%d", turns, totalTokens, len(toolCallMap), len(metrics.ToolSequences)) diff --git a/pkg/cli/copilot_events_jsonl_test.go b/pkg/cli/copilot_events_jsonl_test.go index 3373f26ce7e..554fd69fba9 100644 --- a/pkg/cli/copilot_events_jsonl_test.go +++ b/pkg/cli/copilot_events_jsonl_test.go @@ -6,6 +6,7 @@ import ( "os" "path/filepath" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -291,3 +292,66 @@ func TestExtractLogMetrics_EventsJSONLPriority(t *testing.T) { assert.GreaterOrEqual(t, metrics.Turns, 1, "should detect at least one turn from log file") }) } + +// TestParseEventsJSONLFile_TBT verifies that Time Between Turns is computed +// correctly from per-turn timestamps embedded in user.message events. +func TestParseEventsJSONLFile_TBT(t *testing.T) { + // Helper that produces an events.jsonl line with a specific RFC3339 timestamp. + lineWithTimestamp := func(eventType, dataJSON, timestamp string) string { + return `{"type":"` + eventType + `","data":` + dataJSON + `,"id":"test-id","timestamp":"` + timestamp + `"}` + } + + t.Run("computes avg and max TBT from multi-turn timestamps", func(t *testing.T) { + // Turn 1 at T+0s, Turn 2 at T+30s, Turn 3 at T+90s → intervals: 30s, 60s → avg 45s, max 60s + content := lineWithTimestamp("session.start", `{"sessionId":"s1","copilotVersion":"1.0.0"}`, "2026-01-01T10:00:00Z") + "\n" + + lineWithTimestamp("user.message", `{"content":"turn 1"}`, "2026-01-01T10:00:00Z") + "\n" + + lineWithTimestamp("user.message", `{"content":"turn 2"}`, "2026-01-01T10:00:30Z") + "\n" + + lineWithTimestamp("user.message", `{"content":"turn 3"}`, "2026-01-01T10:01:30Z") + "\n" + + lineWithTimestamp("session.shutdown", `{"shutdownType":"routine","totalPremiumRequests":3,"modelMetrics":{"m":{"usage":{"inputTokens":100,"outputTokens":10}}}}`, "2026-01-01T10:02:00Z") + "\n" + + dir := t.TempDir() + eventsPath := filepath.Join(dir, "events.jsonl") + require.NoError(t, os.WriteFile(eventsPath, []byte(content), 0644)) + + metrics, err := parseEventsJSONLFile(eventsPath, false) + require.NoError(t, err, "should parse without error") + + assert.Equal(t, 3, metrics.Turns, "should detect 3 turns") + assert.Equal(t, 45*time.Second, metrics.AvgTimeBetweenTurns, "avg TBT should be 45s") + assert.Equal(t, 60*time.Second, metrics.MaxTimeBetweenTurns, "max TBT should be 60s") + }) + + t.Run("no TBT when only one turn", func(t *testing.T) { + content := lineWithTimestamp("user.message", `{"content":"single turn"}`, "2026-01-01T10:00:00Z") + "\n" + + lineWithTimestamp("session.shutdown", `{"shutdownType":"routine","totalPremiumRequests":1,"modelMetrics":{"m":{"usage":{"inputTokens":50,"outputTokens":5}}}}`, "2026-01-01T10:01:00Z") + "\n" + + dir := t.TempDir() + eventsPath := filepath.Join(dir, "events.jsonl") + require.NoError(t, os.WriteFile(eventsPath, []byte(content), 0644)) + + metrics, err := parseEventsJSONLFile(eventsPath, false) + require.NoError(t, err, "should parse without error") + + assert.Equal(t, 1, metrics.Turns, "should detect 1 turn") + assert.Zero(t, metrics.AvgTimeBetweenTurns, "no TBT with a single turn") + assert.Zero(t, metrics.MaxTimeBetweenTurns, "no TBT with a single turn") + }) + + t.Run("no TBT when timestamps missing", func(t *testing.T) { + // Use realFormatEventsLine which always uses the same timestamp — no intervals. + content := realFormatEventsLine("user.message", `{"content":"turn 1"}`) + "\n" + + realFormatEventsLine("user.message", `{"content":"turn 2"}`) + "\n" + + realFormatEventsLine("session.shutdown", `{"shutdownType":"routine","totalPremiumRequests":2,"modelMetrics":{"m":{"usage":{"inputTokens":50,"outputTokens":5}}}}`) + "\n" + + dir := t.TempDir() + eventsPath := filepath.Join(dir, "events.jsonl") + require.NoError(t, os.WriteFile(eventsPath, []byte(content), 0644)) + + metrics, err := parseEventsJSONLFile(eventsPath, false) + require.NoError(t, err, "should parse without error") + + assert.Equal(t, 2, metrics.Turns, "should detect 2 turns") + // All user.message events share the same timestamp → interval is 0 → TBT stays zero. + assert.Zero(t, metrics.AvgTimeBetweenTurns, "TBT should be zero when all timestamps are identical") + }) +} diff --git a/pkg/cli/logs_models.go b/pkg/cli/logs_models.go index db9945a6432..8390d56266d 100644 --- a/pkg/cli/logs_models.go +++ b/pkg/cli/logs_models.go @@ -48,33 +48,34 @@ const ( // WorkflowRun represents a GitHub Actions workflow run with metrics type WorkflowRun struct { - DatabaseID int64 `json:"databaseId"` - Number int `json:"number"` - URL string `json:"url"` - Status string `json:"status"` - Conclusion string `json:"conclusion"` - WorkflowName string `json:"workflowName"` - WorkflowPath string `json:"workflowPath"` // Workflow file path (e.g., .github/workflows/copilot-swe-agent.yml) - CreatedAt time.Time `json:"createdAt"` - StartedAt time.Time `json:"startedAt"` - UpdatedAt time.Time `json:"updatedAt"` - Event string `json:"event"` - HeadBranch string `json:"headBranch"` - HeadSha string `json:"headSha"` - DisplayTitle string `json:"displayTitle"` - Duration time.Duration - ActionMinutes float64 // Billable Actions minutes estimated from wall-clock time - TokenUsage int - EstimatedCost float64 - Turns int - ErrorCount int - WarningCount int - MissingToolCount int - MissingDataCount int - NoopCount int - SafeItemsCount int - EffectiveTokens int // Cost-normalized token count computed from per-model multipliers - LogsPath string + DatabaseID int64 `json:"databaseId"` + Number int `json:"number"` + URL string `json:"url"` + Status string `json:"status"` + Conclusion string `json:"conclusion"` + WorkflowName string `json:"workflowName"` + WorkflowPath string `json:"workflowPath"` // Workflow file path (e.g., .github/workflows/copilot-swe-agent.yml) + CreatedAt time.Time `json:"createdAt"` + StartedAt time.Time `json:"startedAt"` + UpdatedAt time.Time `json:"updatedAt"` + Event string `json:"event"` + HeadBranch string `json:"headBranch"` + HeadSha string `json:"headSha"` + DisplayTitle string `json:"displayTitle"` + Duration time.Duration + ActionMinutes float64 // Billable Actions minutes estimated from wall-clock time + TokenUsage int + EstimatedCost float64 + Turns int + ErrorCount int + WarningCount int + MissingToolCount int + MissingDataCount int + NoopCount int + SafeItemsCount int + EffectiveTokens int // Cost-normalized token count computed from per-model multipliers + AvgTimeBetweenTurns time.Duration // Average time between consecutive LLM API calls (from per-turn timestamps when available) + LogsPath string } // LogMetrics represents extracted metrics from log files diff --git a/pkg/cli/logs_orchestrator.go b/pkg/cli/logs_orchestrator.go index 5c4daedcdce..8d643e6f617 100644 --- a/pkg/cli/logs_orchestrator.go +++ b/pkg/cli/logs_orchestrator.go @@ -354,6 +354,7 @@ func DownloadWorkflowLogs(ctx context.Context, workflowName string, count int, s run.TokenUsage = result.Metrics.TokenUsage run.EstimatedCost = result.Metrics.EstimatedCost run.Turns = result.Metrics.Turns + run.AvgTimeBetweenTurns = result.Metrics.AvgTimeBetweenTurns run.ErrorCount = 0 run.WarningCount = 0 run.LogsPath = result.LogsPath @@ -760,6 +761,7 @@ func downloadRunArtifactsConcurrent(ctx context.Context, runs []WorkflowRun, out result.Run.TokenUsage = metrics.TokenUsage result.Run.EstimatedCost = metrics.EstimatedCost result.Run.Turns = metrics.Turns + result.Run.AvgTimeBetweenTurns = metrics.AvgTimeBetweenTurns result.Run.LogsPath = runOutputDir // Calculate duration and billable minutes from GitHub API timestamps. diff --git a/pkg/cli/logs_report.go b/pkg/cli/logs_report.go index f20e8149a54..a37f3664660 100644 --- a/pkg/cli/logs_report.go +++ b/pkg/cli/logs_report.go @@ -114,6 +114,7 @@ type RunData struct { AwContext *AwContext `json:"context,omitempty" console:"-"` // aw_context data from aw_info.json TokenUsageSummary *TokenUsageSummary `json:"token_usage_summary,omitempty" console:"-"` // Token usage from firewall proxy GitHubAPICalls int `json:"github_api_calls,omitempty" console:"header:GitHub API Calls,format:number,omitempty"` // GitHub API calls made during the run + AvgTimeBetweenTurns string `json:"avg_time_between_turns,omitempty" console:"-"` // Average time between consecutive LLM API calls (TBT) } // buildLogsData creates structured logs data from processed runs @@ -230,6 +231,12 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation if run.Duration > 0 { runData.Duration = timeutil.FormatDuration(run.Duration) } + // Compute average TBT from metrics when available; fall back to wall-time / turns. + if run.AvgTimeBetweenTurns > 0 { + runData.AvgTimeBetweenTurns = timeutil.FormatDuration(run.AvgTimeBetweenTurns) + } else if run.Turns > 1 && run.Duration > 0 { + runData.AvgTimeBetweenTurns = timeutil.FormatDuration(run.Duration/time.Duration(run.Turns)) + " (estimated)" + } runs = append(runs, runData) } diff --git a/pkg/workflow/metrics.go b/pkg/workflow/metrics.go index 6ab9baae06d..4ebccb53d4a 100644 --- a/pkg/workflow/metrics.go +++ b/pkg/workflow/metrics.go @@ -23,11 +23,13 @@ type ToolCallInfo struct { // LogMetrics represents extracted metrics from log files type LogMetrics struct { - TokenUsage int - EstimatedCost float64 - Turns int // Number of turns needed to complete the task - ToolCalls []ToolCallInfo // Tool call statistics - ToolSequences [][]string // Sequences of tool calls preserving order + TokenUsage int + EstimatedCost float64 + Turns int // Number of turns needed to complete the task + ToolCalls []ToolCallInfo // Tool call statistics + ToolSequences [][]string // Sequences of tool calls preserving order + AvgTimeBetweenTurns time.Duration // Average time between consecutive LLM API calls (computed from per-turn timestamps when available) + MaxTimeBetweenTurns time.Duration // Maximum time between any two consecutive LLM API calls // Timestamp removed - use GitHub API timestamps instead of parsing from logs } From 584ffa7e40a57f169b8cbaf5379a5533331ec827 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 14 Apr 2026 23:00:01 +0000 Subject: [PATCH 2/3] docs(adr): add draft ADR-26321 for TBT metric and cache TTL warning --- ...321-track-time-between-turns-tbt-metric.md | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 docs/adr/26321-track-time-between-turns-tbt-metric.md diff --git a/docs/adr/26321-track-time-between-turns-tbt-metric.md b/docs/adr/26321-track-time-between-turns-tbt-metric.md new file mode 100644 index 00000000000..3b4cab8a139 --- /dev/null +++ b/docs/adr/26321-track-time-between-turns-tbt-metric.md @@ -0,0 +1,86 @@ +# ADR-26321: Track Time Between Turns (TBT) as an Agentic Performance Metric with Prompt Cache TTL Warning + +**Date**: 2026-04-14 +**Status**: Draft +**Deciders**: pelikhan, Copilot + +--- + +## Part 1 — Narrative (Human-Friendly) + +### Context + +Agentic workflow runs issue multiple sequential LLM API calls. LLM inference providers implement server-side prompt caching to avoid re-processing unchanged context on each turn; the cache has a fixed time-to-live (TTL). Anthropic recently reduced their cache TTL from 1 hour to 5 minutes. When the elapsed time between consecutive LLM calls (Time Between Turns, TBT) exceeds this TTL, the cached prompt context expires and every subsequent turn pays full re-processing costs — significantly increasing token spend. The `gh aw audit` and `gh aw logs` commands already report turn counts and wall-clock time but gave users no visibility into whether their TBT was safe relative to provider cache TTLs. + +### Decision + +We will add `AvgTimeBetweenTurns` and `MaxTimeBetweenTurns` as first-class fields on `LogMetrics` and surface them through the entire `audit`/`logs` pipeline. For Copilot engine runs, TBT will be computed precisely from per-turn RFC3339 timestamps embedded in `user.message` events in `events.jsonl`. For other engines (or when timestamps are absent), TBT will be estimated as `Duration / Turns`. The audit report will emit a cache warning when the maximum observed TBT exceeds the hard-coded Anthropic 5-minute TTL threshold. We chose to hard-code the 5-minute constant rather than make it configurable because it reflects an externally-imposed provider constraint, not a user preference. + +### Alternatives Considered + +#### Alternative 1: Derive TBT solely from wall-clock time divided by turn count + +TBT could be approximated using only already-available data (`Duration / Turns`). This requires no new parsing logic and works across all engine types. It was rejected as the primary approach because it averages out spikes — a single 10-minute pause surrounded by fast turns would be invisible — whereas the precise timestamp-based approach captures both average and maximum TBT and can correctly identify individual cache-busting intervals. + +#### Alternative 2: Make the cache TTL threshold configurable via `.design-gate.yml` or a CLI flag + +The warning threshold (5 minutes) could be exposed as a user-configurable value to accommodate providers other than Anthropic or future TTL changes. This was rejected because the threshold is a vendor-imposed fact rather than a tunable policy; hard-coding it keeps the feature self-contained and avoids configuration sprawl. If Anthropic changes their TTL again, a code change is appropriate since the PR author will need to update documentation anyway. + +#### Alternative 3: Report TBT only in `gh aw audit` (not in `gh aw logs`) + +TBT could have been scoped to the single-run `audit` command only. This was rejected because `gh aw logs --json` output is consumed by downstream tooling and dashboards that benefit from per-run TBT data at the aggregated level. + +### Consequences + +#### Positive +- Users can identify workflow designs where slow tool calls are busting the prompt cache and costing extra tokens. +- The `CacheWarning` field provides an immediately actionable signal in the audit report. +- JSON output from `gh aw logs` gains a new `avg_time_between_turns` field for programmatic consumption. +- Precise timestamp-based TBT is computed only from engine logs that carry per-turn timestamps; other engines gracefully degrade to an estimate. + +#### Negative +- The Anthropic 5-minute TTL is hard-coded; if the threshold changes, a code change is required. +- Only `user.message` events in `events.jsonl` (Copilot engine format) carry per-turn timestamps; other engine log formats will always use the estimated fallback until they add timestamp support. +- The fallback estimate (`Duration / Turns`) can be misleading for highly variable workflows where some turns are fast and others are slow. + +#### Neutral +- Two new fields (`AvgTimeBetweenTurns`, `MaxTimeBetweenTurns`) are added to `LogMetrics`, `WorkflowRun`, `SessionAnalysis`, and `RunData` — widening the data model across four structs. +- Tests are scoped to the Copilot engine JSONL parser; audit/logs integration tests are unaffected. +- The feature is additive and non-breaking: existing consumers of `gh aw logs --json` will see a new optional field (`avg_time_between_turns`) that is omitted when TBT cannot be computed. + +--- + +## Part 2 — Normative Specification (RFC 2119) + +> The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, **SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this section are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119). + +### TBT Computation + +1. Implementations **MUST** compute TBT as the elapsed time between consecutive LLM API call initiations (one interval per pair of adjacent turns). +2. Implementations **MUST** compute both average TBT (`AvgTimeBetweenTurns`) and maximum TBT (`MaxTimeBetweenTurns`) when two or more turn timestamps are available. +3. Implementations **MUST NOT** report a non-zero TBT value when fewer than two turns are recorded. +4. Implementations **MUST** use per-turn RFC3339 timestamps from `user.message` events in `events.jsonl` as the primary source for TBT computation when those timestamps are available. +5. Implementations **SHOULD** fall back to `Duration / Turns` as an estimated average TBT when per-turn timestamps are absent or when only one unique timestamp value is present; the estimated value **MUST** be labelled `(estimated)` in human-readable output. +6. Implementations **MUST** ignore zero-duration intervals (timestamps that are identical) when computing TBT averages and maximums to avoid artefacts from log files that reuse a single timestamp. + +### Cache Warning + +1. Implementations **MUST** emit a cache warning when the maximum observed TBT exceeds the Anthropic 5-minute prompt cache TTL (300 seconds). +2. Implementations **SHOULD** emit a cache warning when the average TBT exceeds 300 seconds and the maximum is unavailable. +3. Implementations **MUST NOT** emit a cache warning when both average and maximum TBT are zero or absent. +4. The cache warning **MUST** include the observed TBT value and state that the Anthropic 5-minute cache TTL is being exceeded. + +### Data Model and Propagation + +1. Implementations **MUST** store `AvgTimeBetweenTurns` and `MaxTimeBetweenTurns` as `time.Duration` fields on `LogMetrics`. +2. Implementations **MUST** propagate `AvgTimeBetweenTurns` from `LogMetrics` through `WorkflowRun` and into `RunData` for JSON output. +3. Implementations **MUST** expose `avg_time_between_turns` as an `omitempty` JSON field in `RunData` so that existing consumers are unaffected when TBT is unavailable. +4. Implementations **SHOULD** expose both average and maximum TBT in the `SessionAnalysis` struct used by the `audit` command's report renderer. + +### Conformance + +An implementation is considered conformant with this ADR if it satisfies all **MUST** and **MUST NOT** requirements above. Failure to meet any **MUST** or **MUST NOT** requirement constitutes non-conformance. + +--- + +*This is a DRAFT ADR generated by the [Design Decision Gate](https://github.com/github/gh-aw/actions/runs/24427002630) workflow. The PR author must review, complete, and finalize this document before the PR can merge.* From 0914a48e4d4dd8767dd61da6a5cb3ca76141c010 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Apr 2026 23:07:29 +0000 Subject: [PATCH 3/3] fix: address review feedback on TBT computation accuracy - Use validIntervals count (not len-1) when averaging TBT to avoid underestimating when identical/out-of-order timestamps are skipped - Fix wall-time fallback to divide by Turns-1 intervals (not Turns) in both audit_expanded.go and logs_report.go - Rename misleading test subtest to clarify it tests identical timestamps - Update glossary to accurately describe both warning code paths (max TBT for Copilot, estimated avg TBT for other engines) Agent-Logs-Url: https://github.com/github/gh-aw/sessions/16bad17c-1b59-442d-acc5-1be6db1fc7e2 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- docs/src/content/docs/reference/glossary.md | 2 +- pkg/cli/audit_expanded.go | 4 ++-- pkg/cli/copilot_events_jsonl.go | 12 +++++++----- pkg/cli/copilot_events_jsonl_test.go | 2 +- pkg/cli/logs_report.go | 4 ++-- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/src/content/docs/reference/glossary.md b/docs/src/content/docs/reference/glossary.md index 697e3131d92..30211fe9fd9 100644 --- a/docs/src/content/docs/reference/glossary.md +++ b/docs/src/content/docs/reference/glossary.md @@ -412,7 +412,7 @@ The elapsed time between consecutive LLM API calls in an agentic workflow run. A - **Anthropic** reduced their cache TTL from 1 hour to **5 minutes**. If the TBT for any turn exceeds 5 minutes, the cached prompt context expires and the full prompt must be re-processed, significantly increasing token costs. - **OpenAI** has a similar server-side prompt cache with variable TTL. -`gh aw audit` reports both average and maximum TBT in the Session Analysis section. A cache warning is emitted when the maximum observed TBT exceeds the Anthropic 5-minute threshold. For Copilot engine runs, TBT is computed from precise per-turn timestamps in the `events.jsonl` session log; for other engines it is estimated from total wall time divided by turn count. +`gh aw audit` reports both average and maximum TBT in the Session Analysis section. A cache warning is emitted when the TBT used for cache analysis exceeds the Anthropic 5-minute threshold: the maximum observed TBT for Copilot engine runs, where precise per-turn timestamps are available in the `events.jsonl` session log, or the estimated average TBT for other engines, where TBT is derived from total wall time divided by turn count. To reduce TBT — and keep prompt caches warm — minimize blocking tool calls, parallelize independent tool invocations, and avoid long-running shell commands in the critical path between turns. diff --git a/pkg/cli/audit_expanded.go b/pkg/cli/audit_expanded.go index 2b9093e745e..1af75754b63 100644 --- a/pkg/cli/audit_expanded.go +++ b/pkg/cli/audit_expanded.go @@ -232,8 +232,8 @@ func buildSessionAnalysis(processedRun ProcessedRun, metrics LogMetrics) *Sessio ) } } else if metrics.Turns > 1 && run.Duration > 0 { - // Fallback: average cycle time as a proxy for TBT. - avgTBT := run.Duration / time.Duration(metrics.Turns) + // Fallback: estimate TBT from wall time over turns-1 intervals. + avgTBT := run.Duration / time.Duration(metrics.Turns-1) session.AvgTimeBetweenTurns = timeutil.FormatDuration(avgTBT) + " (estimated)" if avgTBT > anthropicCacheTTL { session.CacheWarning = fmt.Sprintf( diff --git a/pkg/cli/copilot_events_jsonl.go b/pkg/cli/copilot_events_jsonl.go index 50a4d947b76..74690045548 100644 --- a/pkg/cli/copilot_events_jsonl.go +++ b/pkg/cli/copilot_events_jsonl.go @@ -281,25 +281,27 @@ func parseEventsJSONLFile(path string, verbose bool) (workflow.LogMetrics, error // Compute Time Between Turns (TBT) from per-turn timestamps. // TBT[i] = timestamp[i] - timestamp[i-1] for i > 0. Two or more timestamps - // are required to measure at least one interval. + // are required to measure at least one interval. Only positive intervals are + // included so that identical or out-of-order timestamps don't skew the average. if len(turnTimestamps) >= 2 { var totalTBT time.Duration var maxTBT time.Duration + validIntervals := 0 for i := 1; i < len(turnTimestamps); i++ { tbt := turnTimestamps[i].Sub(turnTimestamps[i-1]) if tbt > 0 { totalTBT += tbt + validIntervals++ if tbt > maxTBT { maxTBT = tbt } } } - intervals := len(turnTimestamps) - 1 - if totalTBT > 0 && intervals > 0 { - metrics.AvgTimeBetweenTurns = totalTBT / time.Duration(intervals) + if validIntervals > 0 { + metrics.AvgTimeBetweenTurns = totalTBT / time.Duration(validIntervals) metrics.MaxTimeBetweenTurns = maxTBT copilotEventsJSONLLog.Printf("TBT computed: avg=%s max=%s intervals=%d", - metrics.AvgTimeBetweenTurns, metrics.MaxTimeBetweenTurns, intervals) + metrics.AvgTimeBetweenTurns, metrics.MaxTimeBetweenTurns, validIntervals) } } diff --git a/pkg/cli/copilot_events_jsonl_test.go b/pkg/cli/copilot_events_jsonl_test.go index 554fd69fba9..30eb3f8eb1b 100644 --- a/pkg/cli/copilot_events_jsonl_test.go +++ b/pkg/cli/copilot_events_jsonl_test.go @@ -337,7 +337,7 @@ func TestParseEventsJSONLFile_TBT(t *testing.T) { assert.Zero(t, metrics.MaxTimeBetweenTurns, "no TBT with a single turn") }) - t.Run("no TBT when timestamps missing", func(t *testing.T) { + t.Run("no TBT when all timestamps are identical", func(t *testing.T) { // Use realFormatEventsLine which always uses the same timestamp — no intervals. content := realFormatEventsLine("user.message", `{"content":"turn 1"}`) + "\n" + realFormatEventsLine("user.message", `{"content":"turn 2"}`) + "\n" + diff --git a/pkg/cli/logs_report.go b/pkg/cli/logs_report.go index a37f3664660..09d21255525 100644 --- a/pkg/cli/logs_report.go +++ b/pkg/cli/logs_report.go @@ -231,11 +231,11 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation if run.Duration > 0 { runData.Duration = timeutil.FormatDuration(run.Duration) } - // Compute average TBT from metrics when available; fall back to wall-time / turns. + // Compute average TBT from metrics when available; fall back to wall-time / (turns - 1). if run.AvgTimeBetweenTurns > 0 { runData.AvgTimeBetweenTurns = timeutil.FormatDuration(run.AvgTimeBetweenTurns) } else if run.Turns > 1 && run.Duration > 0 { - runData.AvgTimeBetweenTurns = timeutil.FormatDuration(run.Duration/time.Duration(run.Turns)) + " (estimated)" + runData.AvgTimeBetweenTurns = timeutil.FormatDuration(run.Duration/time.Duration(run.Turns-1)) + " (estimated)" } runs = append(runs, runData) }