dzianisv · dzianisv · Feb 16, 2026 · Feb 16, 2026
diff --git a/README.md b/README.md
@@ -115,6 +115,97 @@ Evaluates task completion after each agent response and provides feedback if wor
 4. **Verdict**: PASS → toast notification | FAIL → feedback injected into chat
 5. **Continuation**: Agent receives feedback and continues working
 
+### State Graph
+
+```
+session.idle fires
+    |
+    v
++---------------------------+
+| GUARD CHECKS              |
+| - Is judge/classifier?    |--yes--> SKIP
+| - Is plan mode?           |--yes--> SKIP
+| - Was ESC-aborted?        |--yes--> SKIP (10s cooldown)
+| - Same user msg already   |--yes--> SKIP
+|   reflected?              |
++----------+----------------+
+           | no
+           v
++---------------------------+
+| A) BUILD TASK CONTEXT     |
+| - Collect user messages   |
+| - Infer task type         |
+|   (coding/docs/research/  |
+|    ops/other)             |
+| - Detect repo signals     |
+|   (package.json scripts,  |
+|    test/ dir)             |
+| - Extract tool commands   |
+| - Determine workflow      |
+|   requirements            |
++----------+----------------+
+           v
++---------------------------+
+| B) SELF-ASSESSMENT        |
+| Request agent to produce  |
+| JSON with evidence:       |
+| - Did you complete task?  |
+| - Did you run tests?      |
+| - Did you create PR?      |
+| - Did CI pass?            |
+| - Are you stuck?          |
+| (runs in ephemeral        |
+|  session, not main)       |
++----------+----------------+
+           v
++---------------------------+
+| C) PARSE & EVALUATE       |
+| Parse JSON --success--> evaluateSelfAssessment()
+|            +--fail---> Judge LLM fallback
+|                        |
+| Workflow gate checks:  |
+| 1. Tests ran? Passed?  |
+|    Ran AFTER changes?  |
+|    Not skipped/flaky?  |
+| 2. Build ran? Passed?  |
+| 3. PR created? URL?    |
+|    Evidence (gh pr)?   |
+| 4. CI checked? Passed? |
+|    Evidence (gh pr     |
+|    checks)?            |
+| 5. No push to main?    |
+| 6. Planning loop check |
++----------+----------------+
+           v
++---------------------------+
+| D) VERDICT                |
+| Write .reflection/        |
+|   verdict_<session>.json  |
+|                           |
+| Three outcomes:           |
+| COMPLETE      --> Toast success, done
+| NEEDS USER    --> Toast warning, done
+| INCOMPLETE    --> Continue below
++----------+----------------+
+           | incomplete
+           v
++---------------------------+
+| E) FEEDBACK + ROUTING     |
+| - Classify task category  |
+|   (backend/arch/frontend) |
+| - Build escalating        |
+|   feedback (attempt N/5)  |
+| - Inject feedback into    |
+|   session (optionally     |
+|   with model routing)     |
+| - Agent continues work    |
++---------------------------+
+           |
+           v
+     (session.idle fires again --> loop back to top,
+      up to MAX_ATTEMPTS=5)
+```
+
 ### Features
 
 - **OpenCode Sessions API**: Uses OpenCode's session management to create isolated judge sessions

diff --git a/docs/reflection.md b/docs/reflection.md
@@ -78,6 +78,97 @@ flowchart TD
   Done -->|no| Feedback[Prompt feedback to continue]
 ```
 
+### Detailed State Graph
+
+```
+session.idle fires
+    |
+    v
++---------------------------+
+| GUARD CHECKS              |
+| - Is judge/classifier?    |--yes--> SKIP
+| - Is plan mode?           |--yes--> SKIP
+| - Was ESC-aborted?        |--yes--> SKIP (10s cooldown)
+| - Same user msg already   |--yes--> SKIP
+|   reflected?              |
++----------+----------------+
+           | no
+           v
++---------------------------+
+| A) BUILD TASK CONTEXT     |
+| - Collect user messages   |
+| - Infer task type         |
+|   (coding/docs/research/  |
+|    ops/other)             |
+| - Detect repo signals     |
+|   (package.json scripts,  |
+|    test/ dir)             |
+| - Extract tool commands   |
+| - Determine workflow      |
+|   requirements            |
++----------+----------------+
+           v
++---------------------------+
+| B) SELF-ASSESSMENT        |
+| Request agent to produce  |
+| JSON with evidence:       |
+| - Did you complete task?  |
+| - Did you run tests?      |
+| - Did you create PR?      |
+| - Did CI pass?            |
+| - Are you stuck?          |
+| (runs in ephemeral        |
+|  session, not main)       |
++----------+----------------+
+           v
++---------------------------+
+| C) PARSE & EVALUATE       |
+| Parse JSON --success--> evaluateSelfAssessment()
+|            +--fail---> Judge LLM fallback
+|                        |
+| Workflow gate checks:  |
+| 1. Tests ran? Passed?  |
+|    Ran AFTER changes?  |
+|    Not skipped/flaky?  |
+| 2. Build ran? Passed?  |
+| 3. PR created? URL?    |
+|    Evidence (gh pr)?   |
+| 4. CI checked? Passed? |
+|    Evidence (gh pr     |
+|    checks)?            |
+| 5. No push to main?    |
+| 6. Planning loop check |
++----------+----------------+
+           v
++---------------------------+
+| D) VERDICT                |
+| Write .reflection/        |
+|   verdict_<session>.json  |
+|                           |
+| Three outcomes:           |
+| COMPLETE      --> Toast success, done
+| NEEDS USER    --> Toast warning, done
+| INCOMPLETE    --> Continue below
++----------+----------------+
+           | incomplete
+           v
++---------------------------+
+| E) FEEDBACK + ROUTING     |
+| - Classify task category  |
+|   (backend/arch/frontend) |
+| - Build escalating        |
+|   feedback (attempt N/5)  |
+| - Inject feedback into    |
+|   session (optionally     |
+|   with model routing)     |
+| - Agent continues work    |
++---------------------------+
+           |
+           v
+     (session.idle fires again --> loop back to top,
+      up to MAX_ATTEMPTS=5)
+```
+
 ## Files and Artifacts
 - `<workspace>/.reflection/verdict_<session>.json` (signal for TTS/Telegram)
 - `<workspace>/.reflection/<session>_<timestamp>.json` (full analysis record)

diff --git a/reflection-3.test-helpers.ts b/reflection-3.test-helpers.ts
@@ -310,6 +310,11 @@ export interface RoutingConfig {
   models: Record<RoutingCategory, string>
 }
 
+export interface ModelSpecParts { // The two halves of a "provider/model" spec string, as returned by parseModelSpec.
+  providerID: string // Text before the first "/" of the spec.
+  modelID: string // Everything after the first "/"; may itself contain further "/" characters.
+}
+
 
 export function parseRoutingFromYaml(content: string): RoutingConfig {
   const DEFAULT_ROUTING_CONFIG: RoutingConfig = {
@@ -376,6 +381,27 @@ export function getRoutingModel(config: RoutingConfig, category: RoutingCategory
   return { providerID, modelID }
 }
 
+export function parseModelSpec(modelSpec: string | null | undefined): ModelSpecParts | null { // Splits a "provider/model" spec into its two parts; returns null for any unusable input.
+  if (typeof modelSpec !== "string") return null // null, undefined and any non-string runtime value are rejected
+  const trimmed = modelSpec.trim() // only outer whitespace is removed; whitespace next to the "/" stays in the parts
+  if (!trimmed) return null // empty or whitespace-only spec
+  const parts = trimmed.split("/")
+  if (parts.length < 2) return null // no "/" separator present
+  const providerID = parts[0] || ""
+  const modelID = parts.slice(1).join("/") || "" // rejoin the remainder so model IDs containing "/" survive intact
+  if (!providerID || !modelID) return null // one side is empty, e.g. "/model" or "provider/"
+  return { providerID, modelID }
+}
+
+export function getCrossReviewModelSpec(modelSpec: string | null | undefined): string | null { // Maps a model spec to the spec of its paired cross-review model; returns null when there is no pairing.
+  const parsed = parseModelSpec(modelSpec)
+  if (!parsed) return null // an unparseable spec has no counterpart
+  const modelID = parsed.modelID.toLowerCase() // compare case-insensitively; the provider part is not consulted
+  if (modelID === "claude-opus-4.6") return "github-copilot/gpt-5.2-codex" // result always uses the github-copilot provider
+  if (modelID === "gpt-5.2-codex") return "github-copilot/claude-opus-4.6"
+  return null // any other model has no cross-review pairing
+}
+
 const FEEDBACK_MARKER = "## Reflection-3:"
 const MAX_ATTEMPTS = 5