diff --git a/.github/workflows/agentic-observability-kit.md b/.github/workflows/agentic-observability-kit.md index 8a45d0a5e01..3702d815e64 100644 --- a/.github/workflows/agentic-observability-kit.md +++ b/.github/workflows/agentic-observability-kit.md @@ -113,6 +113,7 @@ The logs JSON already contains the main agentic signals. Prefer these fields ove - `behavior_fingerprint.actuation_style` - `behavior_fingerprint.resource_profile` - `behavior_fingerprint.dispatch_mode` +- `behavior_fingerprint.agentic_fraction` - `agentic_assessments[].kind` - `agentic_assessments[].severity` - `context.repo` @@ -125,6 +126,8 @@ The logs JSON already contains the main agentic signals. Prefer these fields ove - `comparison.classification.label` - `comparison.classification.reason_codes[]` - `comparison.recommendation.action` +- `action_minutes` (estimated billable Actions minutes per run) +- `summary.total_action_minutes` Treat these values as the canonical signals for reporting. @@ -162,6 +165,7 @@ Include small numeric summaries such as: - runs with `comparison.classification.label == "risky"` - runs with medium or high `agentic_assessments` - workflows with repeated `overkill_for_agentic` +- workflows with `partially_reducible` or `model_downgrade_available` assessments - workflows whose comparisons mostly fell back to `latest_success` ### Details diff --git a/pkg/cli/audit_agentic_analysis.go b/pkg/cli/audit_agentic_analysis.go index 41e342cecfa..8cf5a1eb664 100644 --- a/pkg/cli/audit_agentic_analysis.go +++ b/pkg/cli/audit_agentic_analysis.go @@ -23,11 +23,12 @@ type TaskDomainInfo struct { // BehaviorFingerprint summarizes the run's execution profile in compact dimensions. 
type BehaviorFingerprint struct { - ExecutionStyle string `json:"execution_style"` - ToolBreadth string `json:"tool_breadth"` - ActuationStyle string `json:"actuation_style"` - ResourceProfile string `json:"resource_profile"` - DispatchMode string `json:"dispatch_mode"` + ExecutionStyle string `json:"execution_style"` + ToolBreadth string `json:"tool_breadth"` + ActuationStyle string `json:"actuation_style"` + ResourceProfile string `json:"resource_profile"` + DispatchMode string `json:"dispatch_mode"` + AgenticFraction float64 `json:"agentic_fraction"` // Ratio of reasoning turns to total turns (0.0-1.0) } // AgenticAssessment captures an actionable judgment about the run's behavior. @@ -105,6 +106,7 @@ func deriveRunAgenticAnalysis(processedRun ProcessedRun, metrics LogMetrics) (*A metricsData := MetricsData{ TokenUsage: processedRun.Run.TokenUsage, EstimatedCost: processedRun.Run.EstimatedCost, + ActionMinutes: processedRun.Run.ActionMinutes, Turns: processedRun.Run.Turns, ErrorCount: processedRun.Run.ErrorCount, WarningCount: processedRun.Run.WarningCount, @@ -201,12 +203,15 @@ func buildBehaviorFingerprint(processedRun ProcessedRun, metrics MetricsData, to dispatchMode = "delegated" } + agenticFraction := computeAgenticFraction(processedRun) + return &BehaviorFingerprint{ ExecutionStyle: executionStyle, ToolBreadth: toolBreadth, ActuationStyle: actuationStyle, ResourceProfile: resourceProfile, DispatchMode: dispatchMode, + AgenticFraction: agenticFraction, } } @@ -258,6 +263,40 @@ func buildAgenticAssessments(processedRun ProcessedRun, metrics MetricsData, too }) } + // Partially reducible: the workflow has a low agentic fraction, meaning + // many turns are data-gathering that could be moved to deterministic steps: + // or post-steps: in the frontmatter. Only flag when there's substantive work + // (not lean/directed runs which overkill_for_agentic already covers). 
+ if fingerprint.AgenticFraction > 0 && fingerprint.AgenticFraction < 0.6 && + fingerprint.ResourceProfile != "lean" { + severity := "low" + if fingerprint.AgenticFraction < 0.4 { + severity = "medium" + } + deterministicPct := int((1.0 - fingerprint.AgenticFraction) * 100) + assessments = append(assessments, AgenticAssessment{ + Kind: "partially_reducible", + Severity: severity, + Summary: fmt.Sprintf("About %d%% of this run's turns appear to be data-gathering that could move to deterministic steps.", deterministicPct), + Evidence: fmt.Sprintf("agentic_fraction=%.2f turns=%d", fingerprint.AgenticFraction, metrics.Turns), + Recommendation: "Move data-fetching work to frontmatter steps: (pre-agent) writing to /tmp/gh-aw/agent/ or post-steps: (post-agent) to reduce inference cost. See the Deterministic & Agentic Patterns guide.", + }) + } + + // Model downgrade suggestion: the run uses a heavy resource profile but + // the task domain is simple enough that a smaller model would likely suffice. + if fingerprint.ResourceProfile != "lean" && + (domain.Name == "triage" || domain.Name == "repo_maintenance" || domain.Name == "issue_response") && + fingerprint.ActuationStyle != "write_heavy" { + assessments = append(assessments, AgenticAssessment{ + Kind: "model_downgrade_available", + Severity: "low", + Summary: fmt.Sprintf("This %s run may not need a frontier model. A smaller model (e.g. 
gpt-4.1-mini, claude-haiku-4-5) could handle the task at lower cost.", domain.Label), + Evidence: fmt.Sprintf("domain=%s resource_profile=%s actuation=%s", domain.Name, fingerprint.ResourceProfile, fingerprint.ActuationStyle), + Recommendation: "Try engine.model: gpt-4.1-mini or claude-haiku-4-5 in the workflow frontmatter.", + }) + } + + if awContext != nil { assessments = append(assessments, AgenticAssessment{ Kind: "delegated_context_present", @@ -286,6 +325,12 @@ func generateAgenticAssessmentFindings(assessments []AgenticAssessment) []Findin case "poor_agentic_control": category = "agentic" impact = "Broad or weakly controlled behavior can reduce trust even when the run succeeds" + case "partially_reducible": + category = "optimization" + impact = "Moving data-gathering turns to deterministic steps reduces inference cost" + case "model_downgrade_available": + category = "optimization" + impact = "A smaller model could reduce per-run cost significantly for this task domain" case "delegated_context_present": category = "coordination" impact = "Context continuity improves downstream debugging and auditability" @@ -320,6 +365,41 @@ func generateAgenticAssessmentRecommendations(assessments []AgenticAssessment) [ return recommendations } +// computeAgenticFraction estimates the share of a run's turns that are agentic +// using only Turns and SafeItemsCount; it does not inspect tool sequences. +// With write actions the estimate is min(SafeItemsCount+1, Turns) / Turns. Read-only +// runs return 1.0 when Turns <= 3 and 0.5 otherwise; Turns <= 0 returns 0.
+func computeAgenticFraction(processedRun ProcessedRun) float64 { + run := processedRun.Run + if run.Turns <= 0 { + return 0.0 + } + + // If no tool sequence data, estimate from write actions + writeCount := run.SafeItemsCount + if writeCount > 0 && run.Turns > 0 { + // At minimum, the fraction of turns that produced write actions are agentic + // and non-write turns doing data gathering are deterministic-reducible + agenticTurns := writeCount + // Add reasoning turns: at least 1 turn of reasoning per write action, + // plus initial planning turn + reasoningTurns := min(writeCount+1, run.Turns) + agenticTurns = max(agenticTurns, reasoningTurns) + agenticTurns = min(agenticTurns, run.Turns) + return float64(agenticTurns) / float64(run.Turns) + } + + // No write actions: if the run is read-only with few turns, nearly all + // turns are reasoning (the AI is analyzing, not just gathering data) + if run.Turns <= 3 { + return 1.0 + } + + // For longer read-only runs, assume early turns gather context and + // later turns do analysis. Heuristic: half the turns are reasoning. 
+ return 0.5 +} + func containsAny(value string, terms ...string) bool { for _, term := range terms { if strings.Contains(value, term) { @@ -339,6 +419,10 @@ func prettifyAssessmentKind(kind string) string { return "Weak Agentic Control" case "delegated_context_present": return "Dispatch Context Preserved" + case "partially_reducible": + return "Partially Reducible To Deterministic" + case "model_downgrade_available": + return "Cheaper Model Available" default: return strings.ReplaceAll(kind, "_", " ") } diff --git a/pkg/cli/audit_agentic_analysis_test.go b/pkg/cli/audit_agentic_analysis_test.go index 96fa46fd7d8..dbb9aa4bded 100644 --- a/pkg/cli/audit_agentic_analysis_test.go +++ b/pkg/cli/audit_agentic_analysis_test.go @@ -3,6 +3,7 @@ package cli import ( + "math" "testing" "time" @@ -105,3 +106,151 @@ func TestBuildAuditDataIncludesAgenticAnalysis(t *testing.T) { assert.NotEmpty(t, auditData.AgenticAssessments, "agentic assessments should be present") assert.Equal(t, "triage", auditData.TaskDomain.Name) } + +func TestComputeAgenticFraction(t *testing.T) { + tests := []struct { + name string + run WorkflowRun + minAlpha float64 + maxAlpha float64 + }{ + { + name: "zero turns returns zero", + run: WorkflowRun{Turns: 0}, + minAlpha: 0.0, + maxAlpha: 0.0, + }, + { + name: "short read-only run is fully agentic", + run: WorkflowRun{Turns: 2}, + minAlpha: 1.0, + maxAlpha: 1.0, + }, + { + name: "write-heavy run with many turns has partial fraction", + run: WorkflowRun{Turns: 10, SafeItemsCount: 3}, + minAlpha: 0.3, + maxAlpha: 0.5, + }, + { + name: "long read-only run returns 0.5", + run: WorkflowRun{Turns: 8}, + minAlpha: 0.5, + maxAlpha: 0.5, + }, + { + name: "single write action in multi-turn run", + run: WorkflowRun{Turns: 6, SafeItemsCount: 1}, + minAlpha: 0.3, + maxAlpha: 0.4, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pr := ProcessedRun{Run: tt.run} + alpha := computeAgenticFraction(pr) + assert.GreaterOrEqual(t, alpha, 
tt.minAlpha, "agentic fraction should be >= %v", tt.minAlpha) + assert.LessOrEqual(t, alpha, tt.maxAlpha, "agentic fraction should be <= %v", tt.maxAlpha) + }) + } +} + +func TestBuildBehaviorFingerprintIncludesAgenticFraction(t *testing.T) { + processedRun := ProcessedRun{ + Run: WorkflowRun{ + Turns: 8, + Duration: 10 * time.Minute, + SafeItemsCount: 2, + }, + } + metrics := MetricsData{Turns: 8} + toolUsage := []ToolUsageInfo{ + {Name: "bash", CallCount: 3}, + {Name: "github_issue_read", CallCount: 2}, + } + + fp := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil) + require.NotNil(t, fp, "fingerprint should not be nil") + assert.Greater(t, fp.AgenticFraction, 0.0, "agentic fraction should be positive") + assert.LessOrEqual(t, fp.AgenticFraction, 1.0, "agentic fraction should be <= 1.0") +} + +func TestBuildAgenticAssessmentsFlagsPartiallyReducible(t *testing.T) { + processedRun := ProcessedRun{ + Run: WorkflowRun{ + WorkflowName: "Data Collector", + Turns: 10, + Duration: 8 * time.Minute, + SafeItemsCount: 2, + }, + } + metrics := MetricsData{Turns: 10} + toolUsage := []ToolUsageInfo{ + {Name: "bash", CallCount: 5}, + {Name: "github_issue_read", CallCount: 3}, + {Name: "gh", CallCount: 2}, + {Name: "jq", CallCount: 1}, + } + domain := &TaskDomainInfo{Name: "research", Label: "Research"} + fingerprint := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil) + + assessments := buildAgenticAssessments(processedRun, metrics, toolUsage, nil, domain, fingerprint, nil) + + var found bool + for _, a := range assessments { + if a.Kind == "partially_reducible" { + found = true + assert.Contains(t, a.Summary, "data-gathering", "summary should mention data-gathering") + assert.Contains(t, a.Recommendation, "steps:", "recommendation should mention steps:") + } + } + assert.True(t, found, "partially_reducible assessment should be present for low agentic fraction moderate run") +} + +func TestBuildAgenticAssessmentsFlagsModelDowngrade(t 
*testing.T) { + processedRun := ProcessedRun{ + Run: WorkflowRun{ + WorkflowName: "Issue Triage Moderate", + Turns: 7, + Duration: 6 * time.Minute, + SafeItemsCount: 1, + }, + } + metrics := MetricsData{Turns: 7} + toolUsage := []ToolUsageInfo{ + {Name: "bash", CallCount: 3}, + {Name: "github_issue_read", CallCount: 2}, + {Name: "grep", CallCount: 1}, + {Name: "jq", CallCount: 1}, + } + domain := &TaskDomainInfo{Name: "triage", Label: "Triage"} + fingerprint := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil) + + assessments := buildAgenticAssessments(processedRun, metrics, toolUsage, nil, domain, fingerprint, nil) + + var found bool + for _, a := range assessments { + if a.Kind == "model_downgrade_available" { + found = true + assert.Contains(t, a.Recommendation, "gpt-4.1-mini", "should suggest a cheaper model") + } + } + assert.True(t, found, "model_downgrade_available assessment should be present for moderate triage run") +} + +func TestActionMinutesComputedFromDuration(t *testing.T) { + run := WorkflowRun{ + Duration: 3*time.Minute + 30*time.Second, + } + // ActionMinutes should be ceil of Duration in minutes + // Since ActionMinutes is set by logs_orchestrator, test the formula directly + expected := 4.0 // ceil(3.5) + actual := math.Ceil(run.Duration.Minutes()) + assert.InDelta(t, expected, actual, 0.001, "ActionMinutes should be ceiling of duration in minutes") +} + +func TestPrettifyAssessmentKindNewKinds(t *testing.T) { + assert.Equal(t, "Partially Reducible To Deterministic", prettifyAssessmentKind("partially_reducible")) + assert.Equal(t, "Cheaper Model Available", prettifyAssessmentKind("model_downgrade_available")) +} diff --git a/pkg/cli/audit_report.go b/pkg/cli/audit_report.go index 297d067ccac..2c3a6cae018 100644 --- a/pkg/cli/audit_report.go +++ b/pkg/cli/audit_report.go @@ -98,6 +98,7 @@ type OverviewData struct { type MetricsData struct { TokenUsage int `json:"token_usage,omitempty" console:"header:Token 
Usage,format:number,omitempty"` EstimatedCost float64 `json:"estimated_cost,omitempty" console:"header:Estimated Cost,format:cost,omitempty"` + ActionMinutes float64 `json:"action_minutes,omitempty" console:"header:Action Minutes,omitempty"` Turns int `json:"turns,omitempty" console:"header:Turns,omitempty"` ErrorCount int `json:"error_count" console:"header:Errors"` WarningCount int `json:"warning_count" console:"header:Warnings"` diff --git a/pkg/cli/logs_models.go b/pkg/cli/logs_models.go index 9f4a987de6d..fcf240d2acc 100644 --- a/pkg/cli/logs_models.go +++ b/pkg/cli/logs_models.go @@ -50,6 +50,7 @@ type WorkflowRun struct { HeadSha string `json:"headSha"` DisplayTitle string `json:"displayTitle"` Duration time.Duration + ActionMinutes float64 // Billable Actions minutes estimated from wall-clock time TokenUsage int EstimatedCost float64 Turns int diff --git a/pkg/cli/logs_orchestrator.go b/pkg/cli/logs_orchestrator.go index 39703d81235..1ec5151e7d2 100644 --- a/pkg/cli/logs_orchestrator.go +++ b/pkg/cli/logs_orchestrator.go @@ -16,6 +16,7 @@ import ( "encoding/json" "errors" "fmt" + "math" "os" "path/filepath" "strings" @@ -346,6 +347,9 @@ func DownloadWorkflowLogs(ctx context.Context, workflowName string, count int, s // Always use GitHub API timestamps for duration calculation if !run.StartedAt.IsZero() && !run.UpdatedAt.IsZero() { run.Duration = run.UpdatedAt.Sub(run.StartedAt) + // Estimate billable Actions minutes from wall-clock time. + // GitHub Actions bills per minute, rounded up per job. 
+ run.ActionMinutes = math.Ceil(run.Duration.Minutes()) } processedRun := ProcessedRun{ diff --git a/pkg/cli/logs_report.go b/pkg/cli/logs_report.go index 51ce5ff140f..a8c05432321 100644 --- a/pkg/cli/logs_report.go +++ b/pkg/cli/logs_report.go @@ -58,6 +58,7 @@ type LogsSummary struct { TotalDuration string `json:"total_duration" console:"header:Total Duration"` TotalTokens int `json:"total_tokens" console:"header:Total Tokens,format:number"` TotalCost float64 `json:"total_cost" console:"header:Total Cost,format:cost"` + TotalActionMinutes float64 `json:"total_action_minutes" console:"header:Total Action Minutes"` TotalTurns int `json:"total_turns" console:"header:Total Turns"` TotalErrors int `json:"total_errors" console:"header:Total Errors"` TotalWarnings int `json:"total_warnings" console:"header:Total Warnings"` @@ -78,6 +79,7 @@ type RunData struct { Status string `json:"status" console:"header:Status"` Conclusion string `json:"conclusion,omitempty" console:"-"` Duration string `json:"duration,omitempty" console:"header:Duration,omitempty"` + ActionMinutes float64 `json:"action_minutes,omitempty" console:"header:Action Minutes,omitempty"` TokenUsage int `json:"token_usage,omitempty" console:"header:Tokens,format:number,omitempty"` EstimatedCost float64 `json:"estimated_cost,omitempty" console:"header:Cost ($),format:cost,omitempty"` Turns int `json:"turns,omitempty" console:"header:Turns,omitempty"` @@ -159,6 +161,7 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation var totalDuration time.Duration var totalTokens int var totalCost float64 + var totalActionMinutes float64 var totalTurns int var totalErrors int var totalWarnings int @@ -177,6 +180,7 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation } totalTokens += run.TokenUsage totalCost += run.EstimatedCost + totalActionMinutes += run.ActionMinutes totalTurns += run.Turns totalErrors += run.ErrorCount totalWarnings += run.WarningCount @@ 
-210,6 +214,7 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation Conclusion: run.Conclusion, TokenUsage: run.TokenUsage, EstimatedCost: run.EstimatedCost, + ActionMinutes: run.ActionMinutes, Turns: run.Turns, ErrorCount: run.ErrorCount, WarningCount: run.WarningCount, @@ -247,16 +252,17 @@ func buildLogsData(processedRuns []ProcessedRun, outputDir string, continuation } summary := LogsSummary{ - TotalRuns: len(processedRuns), - TotalDuration: timeutil.FormatDuration(totalDuration), - TotalTokens: totalTokens, - TotalCost: totalCost, - TotalTurns: totalTurns, - TotalErrors: totalErrors, - TotalWarnings: totalWarnings, - TotalMissingTools: totalMissingTools, - TotalMissingData: totalMissingData, - TotalSafeItems: totalSafeItems, + TotalRuns: len(processedRuns), + TotalDuration: timeutil.FormatDuration(totalDuration), + TotalTokens: totalTokens, + TotalCost: totalCost, + TotalActionMinutes: totalActionMinutes, + TotalTurns: totalTurns, + TotalErrors: totalErrors, + TotalWarnings: totalWarnings, + TotalMissingTools: totalMissingTools, + TotalMissingData: totalMissingData, + TotalSafeItems: totalSafeItems, } episodes, edges := buildEpisodeData(runs, processedRuns)