Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/agentic-observability-kit.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ The logs JSON already contains the main agentic signals. Prefer these fields ove
- `behavior_fingerprint.actuation_style`
- `behavior_fingerprint.resource_profile`
- `behavior_fingerprint.dispatch_mode`
- `behavior_fingerprint.agentic_fraction`
- `agentic_assessments[].kind`
- `agentic_assessments[].severity`
- `context.repo`
Expand All @@ -125,6 +126,8 @@ The logs JSON already contains the main agentic signals. Prefer these fields ove
- `comparison.classification.label`
- `comparison.classification.reason_codes[]`
- `comparison.recommendation.action`
- `action_minutes` (estimated billable Actions minutes per run)
- `summary.total_action_minutes`

Treat these values as the canonical signals for reporting.

Expand Down Expand Up @@ -162,6 +165,7 @@ Include small numeric summaries such as:
- runs with `comparison.classification.label == "risky"`
- runs with medium or high `agentic_assessments`
- workflows with repeated `overkill_for_agentic`
- workflows with `partially_reducible` or `model_downgrade_available` assessments
- workflows whose comparisons mostly fell back to `latest_success`

### Details
Expand Down
94 changes: 89 additions & 5 deletions pkg/cli/audit_agentic_analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ type TaskDomainInfo struct {

// BehaviorFingerprint summarizes the run's execution profile in compact dimensions.
// NOTE(review): this span is rendered diff residue — the first run of fields
// (ExecutionStyle..DispatchMode) is the pre-change declaration and the second
// run plus AgenticFraction is the post-change one. The real source file
// contains only the latter set; left byte-identical here because the
// interleaved form is not compilable Go.
type BehaviorFingerprint struct {
ExecutionStyle string `json:"execution_style"`
ToolBreadth string `json:"tool_breadth"`
ActuationStyle string `json:"actuation_style"`
ResourceProfile string `json:"resource_profile"`
DispatchMode string `json:"dispatch_mode"`
ExecutionStyle string `json:"execution_style"`
ToolBreadth string `json:"tool_breadth"`
ActuationStyle string `json:"actuation_style"`
ResourceProfile string `json:"resource_profile"`
DispatchMode string `json:"dispatch_mode"`
AgenticFraction float64 `json:"agentic_fraction"` // Ratio of reasoning turns to total turns (0.0-1.0)
}

// AgenticAssessment captures an actionable judgment about the run's behavior.
Expand Down Expand Up @@ -105,6 +106,7 @@ func deriveRunAgenticAnalysis(processedRun ProcessedRun, metrics LogMetrics) (*A
metricsData := MetricsData{
TokenUsage: processedRun.Run.TokenUsage,
EstimatedCost: processedRun.Run.EstimatedCost,
ActionMinutes: processedRun.Run.ActionMinutes,
Turns: processedRun.Run.Turns,
ErrorCount: processedRun.Run.ErrorCount,
WarningCount: processedRun.Run.WarningCount,
Expand Down Expand Up @@ -201,12 +203,15 @@ func buildBehaviorFingerprint(processedRun ProcessedRun, metrics MetricsData, to
dispatchMode = "delegated"
}

agenticFraction := computeAgenticFraction(processedRun)

return &BehaviorFingerprint{
ExecutionStyle: executionStyle,
ToolBreadth: toolBreadth,
ActuationStyle: actuationStyle,
ResourceProfile: resourceProfile,
DispatchMode: dispatchMode,
AgenticFraction: agenticFraction,
}
}

Expand Down Expand Up @@ -258,6 +263,40 @@ func buildAgenticAssessments(processedRun ProcessedRun, metrics MetricsData, too
})
}

// Partially reducible: the workflow has a low agentic fraction, meaning
// many turns are data-gathering that could be moved to deterministic steps:
// or post-steps: in the frontmatter. Only flag when there's substantive work
// (not lean/directed runs which overkill_for_agentic already covers).
if fingerprint.AgenticFraction > 0 && fingerprint.AgenticFraction < 0.6 &&
fingerprint.ResourceProfile != "lean" {
severity := "low"
if fingerprint.AgenticFraction < 0.4 {
severity = "medium"
}
deterministicPct := int((1.0 - fingerprint.AgenticFraction) * 100)
assessments = append(assessments, AgenticAssessment{
Kind: "partially_reducible",
Severity: severity,
Summary: fmt.Sprintf("About %d%% of this run's turns appear to be data-gathering that could move to deterministic steps.", deterministicPct),
Evidence: fmt.Sprintf("agentic_fraction=%.2f turns=%d", fingerprint.AgenticFraction, metrics.Turns),
Recommendation: "Move data-fetching work to frontmatter steps: (pre-agent) writing to /tmp/gh-aw/agent/ or post-steps: (post-agent) to reduce inference cost. See the Deterministic & Agentic Patterns guide.",
})
}

// Model downgrade suggestion: the run uses a heavy resource profile but
// the task domain is simple enough that a smaller model would likely suffice.
if fingerprint.ResourceProfile != "lean" &&
(domain.Name == "triage" || domain.Name == "repo_maintenance" || domain.Name == "issue_response") &&
fingerprint.ActuationStyle != "write_heavy" {
assessments = append(assessments, AgenticAssessment{
Kind: "model_downgrade_available",
Severity: "low",
Summary: fmt.Sprintf("This %s run may not need a frontier model. A smaller model (e.g. gpt-4.1-mini, claude-haiku-4-5) could handle the task at lower cost.", domain.Label),
Evidence: fmt.Sprintf("domain=%s resource_profile=%s actuation=%s", domain.Name, fingerprint.ResourceProfile, fingerprint.ActuationStyle),
Recommendation: "Try engine.model: gpt-4.1-mini or claude-haiku-4-5 in the workflow frontmatter.",
})
}

if awContext != nil {
assessments = append(assessments, AgenticAssessment{
Kind: "delegated_context_present",
Expand Down Expand Up @@ -286,6 +325,12 @@ func generateAgenticAssessmentFindings(assessments []AgenticAssessment) []Findin
case "poor_agentic_control":
category = "agentic"
impact = "Broad or weakly controlled behavior can reduce trust even when the run succeeds"
case "partially_reducible":
category = "optimization"
impact = "Moving data-gathering turns to deterministic steps reduces inference cost"
case "model_downgrade_available":
category = "optimization"
impact = "A smaller model could reduce per-run cost significantly for this task domain"
case "delegated_context_present":
category = "coordination"
impact = "Context continuity improves downstream debugging and auditability"
Expand Down Expand Up @@ -320,6 +365,41 @@ func generateAgenticAssessmentRecommendations(assessments []AgenticAssessment) [
return recommendations
}

// computeAgenticFraction estimates the ratio of reasoning turns to total
// turns (0.0-1.0) for a run. Despite the name of the concept, this is a
// heuristic over turn and write-action counts, not an inspection of the
// actual tool-call sequence: write actions anchor a lower bound on reasoning
// turns, and read-only runs are classified by length alone.
func computeAgenticFraction(processedRun ProcessedRun) float64 {
	run := processedRun.Run
	if run.Turns <= 0 {
		return 0.0
	}

	// Estimate from write actions when present: every write-producing turn is
	// agentic, and each write implies at least one reasoning turn plus an
	// initial planning turn.
	if writeCount := run.SafeItemsCount; writeCount > 0 {
		reasoningTurns := min(writeCount+1, run.Turns)
		agenticTurns := max(writeCount, reasoningTurns)
		// Guard against inconsistent data where writes exceed total turns.
		agenticTurns = min(agenticTurns, run.Turns)
		return float64(agenticTurns) / float64(run.Turns)
	}

	// No write actions: a short read-only run is almost entirely reasoning
	// (the AI is analyzing what it reads, not just gathering data).
	if run.Turns <= 3 {
		return 1.0
	}

	// Longer read-only runs: assume early turns gather context and later
	// turns do analysis. Heuristic: half the turns are reasoning.
	return 0.5
}

func containsAny(value string, terms ...string) bool {
for _, term := range terms {
if strings.Contains(value, term) {
Expand All @@ -339,6 +419,10 @@ func prettifyAssessmentKind(kind string) string {
return "Weak Agentic Control"
case "delegated_context_present":
return "Dispatch Context Preserved"
case "partially_reducible":
return "Partially Reducible To Deterministic"
case "model_downgrade_available":
return "Cheaper Model Available"
default:
return strings.ReplaceAll(kind, "_", " ")
}
Expand Down
149 changes: 149 additions & 0 deletions pkg/cli/audit_agentic_analysis_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package cli

import (
"math"
"testing"
"time"

Expand Down Expand Up @@ -105,3 +106,151 @@ func TestBuildAuditDataIncludesAgenticAnalysis(t *testing.T) {
assert.NotEmpty(t, auditData.AgenticAssessments, "agentic assessments should be present")
assert.Equal(t, "triage", auditData.TaskDomain.Name)
}

// TestComputeAgenticFraction checks that the heuristic stays inside the
// expected bounds for representative run shapes.
func TestComputeAgenticFraction(t *testing.T) {
	type testCase struct {
		name     string
		run      WorkflowRun
		minAlpha float64
		maxAlpha float64
	}

	cases := []testCase{
		{
			name:     "zero turns returns zero",
			run:      WorkflowRun{Turns: 0},
			minAlpha: 0.0,
			maxAlpha: 0.0,
		},
		{
			name:     "short read-only run is fully agentic",
			run:      WorkflowRun{Turns: 2},
			minAlpha: 1.0,
			maxAlpha: 1.0,
		},
		{
			name:     "write-heavy run with many turns has partial fraction",
			run:      WorkflowRun{Turns: 10, SafeItemsCount: 3},
			minAlpha: 0.3,
			maxAlpha: 0.5,
		},
		{
			name:     "long read-only run returns 0.5",
			run:      WorkflowRun{Turns: 8},
			minAlpha: 0.5,
			maxAlpha: 0.5,
		},
		{
			name:     "single write action in multi-turn run",
			run:      WorkflowRun{Turns: 6, SafeItemsCount: 1},
			minAlpha: 0.3,
			maxAlpha: 0.4,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := computeAgenticFraction(ProcessedRun{Run: tc.run})
			assert.GreaterOrEqual(t, got, tc.minAlpha, "agentic fraction should be >= %v", tc.minAlpha)
			assert.LessOrEqual(t, got, tc.maxAlpha, "agentic fraction should be <= %v", tc.maxAlpha)
		})
	}
}

func TestBuildBehaviorFingerprintIncludesAgenticFraction(t *testing.T) {
processedRun := ProcessedRun{
Run: WorkflowRun{
Turns: 8,
Duration: 10 * time.Minute,
SafeItemsCount: 2,
},
}
metrics := MetricsData{Turns: 8}
toolUsage := []ToolUsageInfo{
{Name: "bash", CallCount: 3},
{Name: "github_issue_read", CallCount: 2},
}

fp := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil)
require.NotNil(t, fp, "fingerprint should not be nil")
assert.Greater(t, fp.AgenticFraction, 0.0, "agentic fraction should be positive")
assert.LessOrEqual(t, fp.AgenticFraction, 1.0, "agentic fraction should be <= 1.0")
}

func TestBuildAgenticAssessmentsFlagsPartiallyReducible(t *testing.T) {
processedRun := ProcessedRun{
Run: WorkflowRun{
WorkflowName: "Data Collector",
Turns: 10,
Duration: 8 * time.Minute,
SafeItemsCount: 2,
},
}
metrics := MetricsData{Turns: 10}
toolUsage := []ToolUsageInfo{
{Name: "bash", CallCount: 5},
{Name: "github_issue_read", CallCount: 3},
{Name: "gh", CallCount: 2},
{Name: "jq", CallCount: 1},
}
domain := &TaskDomainInfo{Name: "research", Label: "Research"}
fingerprint := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil)

assessments := buildAgenticAssessments(processedRun, metrics, toolUsage, nil, domain, fingerprint, nil)

var found bool
for _, a := range assessments {
if a.Kind == "partially_reducible" {
found = true
assert.Contains(t, a.Summary, "data-gathering", "summary should mention data-gathering")
assert.Contains(t, a.Recommendation, "steps:", "recommendation should mention steps:")
}
}
assert.True(t, found, "partially_reducible assessment should be present for low agentic fraction moderate run")
}

func TestBuildAgenticAssessmentsFlagsModelDowngrade(t *testing.T) {
processedRun := ProcessedRun{
Run: WorkflowRun{
WorkflowName: "Issue Triage Moderate",
Turns: 7,
Duration: 6 * time.Minute,
SafeItemsCount: 1,
},
}
metrics := MetricsData{Turns: 7}
toolUsage := []ToolUsageInfo{
{Name: "bash", CallCount: 3},
{Name: "github_issue_read", CallCount: 2},
{Name: "grep", CallCount: 1},
{Name: "jq", CallCount: 1},
}
domain := &TaskDomainInfo{Name: "triage", Label: "Triage"}
fingerprint := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil)

assessments := buildAgenticAssessments(processedRun, metrics, toolUsage, nil, domain, fingerprint, nil)

var found bool
for _, a := range assessments {
if a.Kind == "model_downgrade_available" {
found = true
assert.Contains(t, a.Recommendation, "gpt-4.1-mini", "should suggest a cheaper model")
}
}
assert.True(t, found, "model_downgrade_available assessment should be present for moderate triage run")
}

func TestActionMinutesComputedFromDuration(t *testing.T) {
run := WorkflowRun{
Duration: 3*time.Minute + 30*time.Second,
}
// ActionMinutes should be ceil of Duration in minutes
// Since ActionMinutes is set by logs_orchestrator, test the formula directly
expected := 4.0 // ceil(3.5)
actual := math.Ceil(run.Duration.Minutes())
assert.InDelta(t, expected, actual, 0.001, "ActionMinutes should be ceiling of duration in minutes")
}

// TestPrettifyAssessmentKindNewKinds checks display labels for the two newly
// added assessment kinds.
func TestPrettifyAssessmentKindNewKinds(t *testing.T) {
	expected := map[string]string{
		"partially_reducible":       "Partially Reducible To Deterministic",
		"model_downgrade_available": "Cheaper Model Available",
	}
	for kind, label := range expected {
		assert.Equal(t, label, prettifyAssessmentKind(kind))
	}
}
1 change: 1 addition & 0 deletions pkg/cli/audit_report.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ type OverviewData struct {
type MetricsData struct {
TokenUsage int `json:"token_usage,omitempty" console:"header:Token Usage,format:number,omitempty"`
EstimatedCost float64 `json:"estimated_cost,omitempty" console:"header:Estimated Cost,format:cost,omitempty"`
ActionMinutes float64 `json:"action_minutes,omitempty" console:"header:Action Minutes,omitempty"`
Turns int `json:"turns,omitempty" console:"header:Turns,omitempty"`
ErrorCount int `json:"error_count" console:"header:Errors"`
WarningCount int `json:"warning_count" console:"header:Warnings"`
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/logs_models.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type WorkflowRun struct {
HeadSha string `json:"headSha"`
DisplayTitle string `json:"displayTitle"`
Duration time.Duration
ActionMinutes float64 // Billable Actions minutes estimated from wall-clock time
TokenUsage int
EstimatedCost float64
Turns int
Expand Down
4 changes: 4 additions & 0 deletions pkg/cli/logs_orchestrator.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -346,6 +347,9 @@ func DownloadWorkflowLogs(ctx context.Context, workflowName string, count int, s
// Always use GitHub API timestamps for duration calculation
if !run.StartedAt.IsZero() && !run.UpdatedAt.IsZero() {
run.Duration = run.UpdatedAt.Sub(run.StartedAt)
// Estimate billable Actions minutes from wall-clock time.
// GitHub Actions bills per minute, rounded up per job.
run.ActionMinutes = math.Ceil(run.Duration.Minutes())
}

processedRun := ProcessedRun{
Expand Down
Loading
Loading