From c8a52efbd00e8ae39000b4012cfda21feca72812 Mon Sep 17 00:00:00 2001
From: Dennis Palatov <dp@metalinxx.io>
Date: Sat, 18 Apr 2026 13:27:57 -0700
Subject: [PATCH] gameplay_capture: carry known_good action as supervised label
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per Dennis's observation: the gameplay records ARE supervised training
triples, but we were only capturing what our baseline PROPOSED, not
what the right move actually was. The winning trace's per-step action
is by definition a good next action — encode it in metadata so
downstream training can use it as the teacher signal.

New metadata fields on every gameplay record:
- known_good_action: int  (GameAction value that the winning trace took)
- known_good_data: Dict|None  (click coords for action=6, else None)
- known_good_level: int  (game level at this step)

What this unlocks for training:
- Router BC: (state → baseline_dispatch)      [already worked]
- Action prediction: (state → known_good_action)      [NEW]
- Motor-skill BC by demonstration: (state × skill_params → known_good_action)  [NEW]
- Outcome-weighted shaping: sample_weight ∝ game_outcome.won   [NEW]
- Backprop through chained components using the winning action
  as the terminal-loss target                                   [NEW]

'This is what SAGE should do next to evaluate what it proposes next' —
the proposal and the ground truth are now both in every record.

Tests: 2 new (18 total). They verify that known_good_action matches the
trace action exactly, that click-step known_good_data carries coords, and
that levels pass through.

Backward compat: old consumers that don't look at the new fields are
unaffected. RouterRecord schema unchanged (metadata is open dict).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../thalamic_router/gameplay_capture.py       | 15 +++++++-
 .../tests/test_gameplay_capture.py            | 37 +++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/sage/cognition/thalamic_router/gameplay_capture.py b/sage/cognition/thalamic_router/gameplay_capture.py
index ae8402a15..33c03e904 100644
--- a/sage/cognition/thalamic_router/gameplay_capture.py
+++ b/sage/cognition/thalamic_router/gameplay_capture.py
@@ -375,7 +375,16 @@ def run(self) -> CaptureResult:
                 self.errors.append(f"step {step.index}: decide failed: {e!r}")
                 continue
 
-            # Build the record with provenance metadata
+            # Build the record with provenance metadata.
+            #
+            # `known_good_*` fields are the supervised-training labels. The
+            # trace is assumed to be a WIN, so the action actually taken at
+            # this tick is treated as a good next action. Downstream training
+            # can then use the record for any of these supervised mappings:
+            #   state → (baseline-proposed dispatch)  [router BC]
+            #   state → known_good_action             [action prediction]
+            #   state × skill_params → known_good_action [motor-skill BC]
+            # Plus outcome-weighted shaping: sample weight ∝ game_outcome.won.
             metadata = {
                 "source": "gameplay",
                 "game": self.trace.game,
@@ -385,6 +394,10 @@ def run(self) -> CaptureResult:
                 "step_index": step.index,
                 "level": step.level,
                 "synthetic_kernel_state": True,
+                # Supervised labels from the winning trace
+                "known_good_action": step.action,
+                "known_good_data": step.data,
+                "known_good_level": step.level,
             }
             record = RouterRecord(
                 router_input=router_input,
diff --git a/sage/cognition/thalamic_router/tests/test_gameplay_capture.py b/sage/cognition/thalamic_router/tests/test_gameplay_capture.py
index 01220162c..ade50148e 100644
--- a/sage/cognition/thalamic_router/tests/test_gameplay_capture.py
+++ b/sage/cognition/thalamic_router/tests/test_gameplay_capture.py
@@ -186,6 +186,10 @@ def test_capture_end_to_end_emits_records(tmp_path):
         assert rec.metadata.get("game") == "testgame"
         assert rec.metadata.get("synthetic_kernel_state") is True
         assert "game_outcome" in rec.metadata
+        # Supervised-training labels
+        assert "known_good_action" in rec.metadata
+        assert rec.metadata["known_good_action"] in (1, 3, 6)   # from _make_trace_3_steps
+        assert "known_good_level" in rec.metadata
 
 
 def test_capture_writes_to_partition(tmp_path):
@@ -271,3 +275,36 @@ def test_capture_all_records_have_valid_router_output(tmp_path):
     for rec in capture.records:
         ok, reason = rec.router_output.validate()
         assert ok, f"invalid output: {reason}"
+
+
+def test_capture_known_good_labels_match_trace_actions(tmp_path):
+    """Check known_good_action and known_good_data on each record of the 3-step trace."""
+    trace = _make_trace_3_steps()
+    writer = RouterDatasetWriter(base_dir=tmp_path / "dataset",
+                                 machine="cbp", compress=False)
+    capture = GameplayCapture(trace=trace, writer=writer, machine="cbp",
+                              env_factory=_make_env_factory())
+    capture.run()
+    writer.close()
+    assert len(capture.records) == 3
+    assert capture.records[0].metadata["known_good_action"] == 1   # UP
+    assert capture.records[1].metadata["known_good_action"] == 3   # LEFT
+    assert capture.records[2].metadata["known_good_action"] == 6   # CLICK
+    # The click step (third record) must carry its click coordinates
+    assert capture.records[2].metadata["known_good_data"] == {"x": 10, "y": 20}
+    # A non-click step (first record) must carry no data
+    assert capture.records[0].metadata["known_good_data"] is None
+
+
+def test_capture_known_good_levels_match_trace_levels(tmp_path):
+    """Each record's known_good_level must equal the level of its trace step."""
+    trace = _make_trace_3_steps()
+    writer = RouterDatasetWriter(base_dir=tmp_path / "dataset",
+                                 machine="cbp", compress=False)
+    capture = GameplayCapture(trace=trace, writer=writer, machine="cbp",
+                              env_factory=_make_env_factory())
+    capture.run()
+    writer.close()
+    assert capture.records[0].metadata["known_good_level"] == 0
+    assert capture.records[1].metadata["known_good_level"] == 0
+    assert capture.records[2].metadata["known_good_level"] == 1