Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/agentic-observability-kit.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ The logs JSON already contains the main agentic signals. Prefer these fields ove
- `behavior_fingerprint.actuation_style`
- `behavior_fingerprint.resource_profile`
- `behavior_fingerprint.dispatch_mode`
- `behavior_fingerprint.agentic_fraction`
- `agentic_assessments[].kind`
- `agentic_assessments[].severity`
- `context.repo`
Expand All @@ -125,6 +126,8 @@ The logs JSON already contains the main agentic signals. Prefer these fields ove
- `comparison.classification.label`
- `comparison.classification.reason_codes[]`
- `comparison.recommendation.action`
- `action_minutes` (estimated billable Actions minutes per run)
- `summary.total_action_minutes`

Treat these values as the canonical signals for reporting.

Expand Down Expand Up @@ -162,6 +165,7 @@ Include small numeric summaries such as:
- runs with `comparison.classification.label == "risky"`
- runs with medium or high `agentic_assessments`
- workflows with repeated `overkill_for_agentic`
- workflows with `partially_reducible` or `model_downgrade_available` assessments
- workflows whose comparisons mostly fell back to `latest_success`

### Details
Expand Down
94 changes: 89 additions & 5 deletions pkg/cli/audit_agentic_analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ type TaskDomainInfo struct {

// BehaviorFingerprint summarizes the run's execution profile in compact dimensions.
// NOTE(review): this span is rendered diff residue — the first run of fields
// (ExecutionStyle..DispatchMode) is the pre-change declaration and the second
// run plus AgenticFraction is the post-change one. The real source file
// contains only the latter set; left byte-identical here because the
// interleaved form is not compilable Go.
type BehaviorFingerprint struct {
ExecutionStyle string `json:"execution_style"`
ToolBreadth string `json:"tool_breadth"`
ActuationStyle string `json:"actuation_style"`
ResourceProfile string `json:"resource_profile"`
DispatchMode string `json:"dispatch_mode"`
ExecutionStyle string `json:"execution_style"`
ToolBreadth string `json:"tool_breadth"`
ActuationStyle string `json:"actuation_style"`
ResourceProfile string `json:"resource_profile"`
DispatchMode string `json:"dispatch_mode"`
AgenticFraction float64 `json:"agentic_fraction"` // Ratio of reasoning turns to total turns (0.0-1.0)
}

// AgenticAssessment captures an actionable judgment about the run's behavior.
Expand Down Expand Up @@ -105,6 +106,7 @@ func deriveRunAgenticAnalysis(processedRun ProcessedRun, metrics LogMetrics) (*A
metricsData := MetricsData{
TokenUsage: processedRun.Run.TokenUsage,
EstimatedCost: processedRun.Run.EstimatedCost,
ActionMinutes: processedRun.Run.ActionMinutes,
Turns: processedRun.Run.Turns,
ErrorCount: processedRun.Run.ErrorCount,
WarningCount: processedRun.Run.WarningCount,
Expand Down Expand Up @@ -201,12 +203,15 @@ func buildBehaviorFingerprint(processedRun ProcessedRun, metrics MetricsData, to
dispatchMode = "delegated"
}

agenticFraction := computeAgenticFraction(processedRun)

return &BehaviorFingerprint{
ExecutionStyle: executionStyle,
ToolBreadth: toolBreadth,
ActuationStyle: actuationStyle,
ResourceProfile: resourceProfile,
DispatchMode: dispatchMode,
AgenticFraction: agenticFraction,
}
}

Expand Down Expand Up @@ -258,6 +263,40 @@ func buildAgenticAssessments(processedRun ProcessedRun, metrics MetricsData, too
})
}

// Partially reducible: the workflow has a low agentic fraction, meaning
// many turns are data-gathering that could be moved to deterministic steps:
// or post-steps: in the frontmatter. Only flag when there's substantive work
// (not lean/directed runs which overkill_for_agentic already covers).
if fingerprint.AgenticFraction > 0 && fingerprint.AgenticFraction < 0.6 &&
fingerprint.ResourceProfile != "lean" {
severity := "low"
if fingerprint.AgenticFraction < 0.4 {
severity = "medium"
}
deterministicPct := int((1.0 - fingerprint.AgenticFraction) * 100)
assessments = append(assessments, AgenticAssessment{
Kind: "partially_reducible",
Severity: severity,
Summary: fmt.Sprintf("About %d%% of this run's turns appear to be data-gathering that could move to deterministic steps.", deterministicPct),
Evidence: fmt.Sprintf("agentic_fraction=%.2f turns=%d", fingerprint.AgenticFraction, metrics.Turns),
Recommendation: "Move data-fetching work to frontmatter steps: (pre-agent) writing to /tmp/gh-aw/agent/ or post-steps: (post-agent) to reduce inference cost. See the Deterministic & Agentic Patterns guide.",
})
}

// Model downgrade suggestion: the run uses a heavy resource profile but
// the task domain is simple enough that a smaller model would likely suffice.
if fingerprint.ResourceProfile != "lean" &&
(domain.Name == "triage" || domain.Name == "repo_maintenance" || domain.Name == "issue_response") &&
fingerprint.ActuationStyle != "write_heavy" {
assessments = append(assessments, AgenticAssessment{
Kind: "model_downgrade_available",
Severity: "low",
Summary: fmt.Sprintf("This %s run may not need a frontier model. A smaller model (e.g. gpt-4.1-mini, claude-haiku-4-5) could handle the task at lower cost.", domain.Label),
Evidence: fmt.Sprintf("domain=%s resource_profile=%s actuation=%s", domain.Name, fingerprint.ResourceProfile, fingerprint.ActuationStyle),
Recommendation: "Try engine.model: gpt-4.1-mini or claude-haiku-4-5 in the workflow frontmatter.",
})
}

if awContext != nil {
assessments = append(assessments, AgenticAssessment{
Kind: "delegated_context_present",
Expand Down Expand Up @@ -286,6 +325,12 @@ func generateAgenticAssessmentFindings(assessments []AgenticAssessment) []Findin
case "poor_agentic_control":
category = "agentic"
impact = "Broad or weakly controlled behavior can reduce trust even when the run succeeds"
case "partially_reducible":
category = "optimization"
impact = "Moving data-gathering turns to deterministic steps reduces inference cost"
case "model_downgrade_available":
category = "optimization"
impact = "A smaller model could reduce per-run cost significantly for this task domain"
case "delegated_context_present":
category = "coordination"
impact = "Context continuity improves downstream debugging and auditability"
Expand Down Expand Up @@ -320,6 +365,41 @@ func generateAgenticAssessmentRecommendations(assessments []AgenticAssessment) [
return recommendations
}

// computeAgenticFraction estimates the ratio of reasoning turns to total
// turns (0.0-1.0) for a run. Despite the name of the concept, this is a
// heuristic over turn and write-action counts, not an inspection of the
// actual tool-call sequence: write actions anchor a lower bound on reasoning
// turns, and read-only runs are classified by length alone.
func computeAgenticFraction(processedRun ProcessedRun) float64 {
	run := processedRun.Run
	if run.Turns <= 0 {
		return 0.0
	}

	// Estimate from write actions when present: every write-producing turn is
	// agentic, and each write implies at least one reasoning turn plus an
	// initial planning turn.
	if writeCount := run.SafeItemsCount; writeCount > 0 {
		reasoningTurns := min(writeCount+1, run.Turns)
		agenticTurns := max(writeCount, reasoningTurns)
		// Guard against inconsistent data where writes exceed total turns.
		agenticTurns = min(agenticTurns, run.Turns)
		return float64(agenticTurns) / float64(run.Turns)
	}

	// No write actions: a short read-only run is almost entirely reasoning
	// (the AI is analyzing what it reads, not just gathering data).
	if run.Turns <= 3 {
		return 1.0
	}

	// Longer read-only runs: assume early turns gather context and later
	// turns do analysis. Heuristic: half the turns are reasoning.
	return 0.5
}

func containsAny(value string, terms ...string) bool {
for _, term := range terms {
if strings.Contains(value, term) {
Expand All @@ -339,6 +419,10 @@ func prettifyAssessmentKind(kind string) string {
return "Weak Agentic Control"
case "delegated_context_present":
return "Dispatch Context Preserved"
case "partially_reducible":
return "Partially Reducible To Deterministic"
case "model_downgrade_available":
return "Cheaper Model Available"
default:
return strings.ReplaceAll(kind, "_", " ")
}
Expand Down
149 changes: 149 additions & 0 deletions pkg/cli/audit_agentic_analysis_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package cli

import (
"math"
"testing"
"time"

Expand Down Expand Up @@ -105,3 +106,151 @@ func TestBuildAuditDataIncludesAgenticAnalysis(t *testing.T) {
assert.NotEmpty(t, auditData.AgenticAssessments, "agentic assessments should be present")
assert.Equal(t, "triage", auditData.TaskDomain.Name)
}

// TestComputeAgenticFraction checks that the heuristic stays inside the
// expected bounds for representative run shapes.
func TestComputeAgenticFraction(t *testing.T) {
	type testCase struct {
		name     string
		run      WorkflowRun
		minAlpha float64
		maxAlpha float64
	}

	cases := []testCase{
		{
			name:     "zero turns returns zero",
			run:      WorkflowRun{Turns: 0},
			minAlpha: 0.0,
			maxAlpha: 0.0,
		},
		{
			name:     "short read-only run is fully agentic",
			run:      WorkflowRun{Turns: 2},
			minAlpha: 1.0,
			maxAlpha: 1.0,
		},
		{
			name:     "write-heavy run with many turns has partial fraction",
			run:      WorkflowRun{Turns: 10, SafeItemsCount: 3},
			minAlpha: 0.3,
			maxAlpha: 0.5,
		},
		{
			name:     "long read-only run returns 0.5",
			run:      WorkflowRun{Turns: 8},
			minAlpha: 0.5,
			maxAlpha: 0.5,
		},
		{
			name:     "single write action in multi-turn run",
			run:      WorkflowRun{Turns: 6, SafeItemsCount: 1},
			minAlpha: 0.3,
			maxAlpha: 0.4,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := computeAgenticFraction(ProcessedRun{Run: tc.run})
			assert.GreaterOrEqual(t, got, tc.minAlpha, "agentic fraction should be >= %v", tc.minAlpha)
			assert.LessOrEqual(t, got, tc.maxAlpha, "agentic fraction should be <= %v", tc.maxAlpha)
		})
	}
}

func TestBuildBehaviorFingerprintIncludesAgenticFraction(t *testing.T) {
processedRun := ProcessedRun{
Run: WorkflowRun{
Turns: 8,
Duration: 10 * time.Minute,
SafeItemsCount: 2,
},
}
metrics := MetricsData{Turns: 8}
toolUsage := []ToolUsageInfo{
{Name: "bash", CallCount: 3},
{Name: "github_issue_read", CallCount: 2},
}

fp := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil)
require.NotNil(t, fp, "fingerprint should not be nil")
assert.Greater(t, fp.AgenticFraction, 0.0, "agentic fraction should be positive")
assert.LessOrEqual(t, fp.AgenticFraction, 1.0, "agentic fraction should be <= 1.0")
}

func TestBuildAgenticAssessmentsFlagsPartiallyReducible(t *testing.T) {
processedRun := ProcessedRun{
Run: WorkflowRun{
WorkflowName: "Data Collector",
Turns: 10,
Duration: 8 * time.Minute,
SafeItemsCount: 2,
},
}
metrics := MetricsData{Turns: 10}
toolUsage := []ToolUsageInfo{
{Name: "bash", CallCount: 5},
{Name: "github_issue_read", CallCount: 3},
{Name: "gh", CallCount: 2},
{Name: "jq", CallCount: 1},
}
domain := &TaskDomainInfo{Name: "research", Label: "Research"}
fingerprint := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil)

assessments := buildAgenticAssessments(processedRun, metrics, toolUsage, nil, domain, fingerprint, nil)

var found bool
for _, a := range assessments {
if a.Kind == "partially_reducible" {
found = true
assert.Contains(t, a.Summary, "data-gathering", "summary should mention data-gathering")
assert.Contains(t, a.Recommendation, "steps:", "recommendation should mention steps:")
}
}
assert.True(t, found, "partially_reducible assessment should be present for low agentic fraction moderate run")
}

func TestBuildAgenticAssessmentsFlagsModelDowngrade(t *testing.T) {
processedRun := ProcessedRun{
Run: WorkflowRun{
WorkflowName: "Issue Triage Moderate",
Turns: 7,
Duration: 6 * time.Minute,
SafeItemsCount: 1,
},
}
metrics := MetricsData{Turns: 7}
toolUsage := []ToolUsageInfo{
{Name: "bash", CallCount: 3},
{Name: "github_issue_read", CallCount: 2},
{Name: "grep", CallCount: 1},
{Name: "jq", CallCount: 1},
}
domain := &TaskDomainInfo{Name: "triage", Label: "Triage"}
fingerprint := buildBehaviorFingerprint(processedRun, metrics, toolUsage, nil, nil)

assessments := buildAgenticAssessments(processedRun, metrics, toolUsage, nil, domain, fingerprint, nil)

var found bool
for _, a := range assessments {
if a.Kind == "model_downgrade_available" {
found = true
assert.Contains(t, a.Recommendation, "gpt-4.1-mini", "should suggest a cheaper model")
}
}
assert.True(t, found, "model_downgrade_available assessment should be present for moderate triage run")
}

func TestActionMinutesComputedFromDuration(t *testing.T) {
run := WorkflowRun{
Duration: 3*time.Minute + 30*time.Second,
}
// ActionMinutes should be ceil of Duration in minutes
// Since ActionMinutes is set by logs_orchestrator, test the formula directly
expected := 4.0 // ceil(3.5)
actual := math.Ceil(run.Duration.Minutes())
assert.InDelta(t, expected, actual, 0.001, "ActionMinutes should be ceiling of duration in minutes")
}

// TestPrettifyAssessmentKindNewKinds checks display labels for the two newly
// added assessment kinds.
func TestPrettifyAssessmentKindNewKinds(t *testing.T) {
	expected := map[string]string{
		"partially_reducible":       "Partially Reducible To Deterministic",
		"model_downgrade_available": "Cheaper Model Available",
	}
	for kind, label := range expected {
		assert.Equal(t, label, prettifyAssessmentKind(kind))
	}
}
1 change: 1 addition & 0 deletions pkg/cli/audit_report.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ type OverviewData struct {
type MetricsData struct {
TokenUsage int `json:"token_usage,omitempty" console:"header:Token Usage,format:number,omitempty"`
EstimatedCost float64 `json:"estimated_cost,omitempty" console:"header:Estimated Cost,format:cost,omitempty"`
ActionMinutes float64 `json:"action_minutes,omitempty" console:"header:Action Minutes,omitempty"`
Turns int `json:"turns,omitempty" console:"header:Turns,omitempty"`
ErrorCount int `json:"error_count" console:"header:Errors"`
WarningCount int `json:"warning_count" console:"header:Warnings"`
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/logs_models.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type WorkflowRun struct {
HeadSha string `json:"headSha"`
DisplayTitle string `json:"displayTitle"`
Duration time.Duration
ActionMinutes float64 // Billable Actions minutes estimated from wall-clock time
TokenUsage int
EstimatedCost float64
Turns int
Expand Down
4 changes: 4 additions & 0 deletions pkg/cli/logs_orchestrator.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -346,6 +347,9 @@ func DownloadWorkflowLogs(ctx context.Context, workflowName string, count int, s
// Always use GitHub API timestamps for duration calculation
if !run.StartedAt.IsZero() && !run.UpdatedAt.IsZero() {
run.Duration = run.UpdatedAt.Sub(run.StartedAt)
// Estimate billable Actions minutes from wall-clock time.
// GitHub Actions bills per minute, rounded up per job.
run.ActionMinutes = math.Ceil(run.Duration.Minutes())
}

processedRun := ProcessedRun{
Expand Down
Loading
Loading