Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cmd/mnemonic/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ func buildRetrievalConfig(cfg *config.Config) retrieval.RetrievalConfig {
RecencyBoostWeight: float32(cfg.Retrieval.RecencyBoostWeight),
RecencyHalfLifeDays: float32(cfg.Retrieval.RecencyHalfLifeDays),

TypeFilterRecencyWeight: float32(cfg.Retrieval.TypeFilterRecencyWeight),
TypeFilterRecencyHalfLife: float32(cfg.Retrieval.TypeFilterRecencyHalfLife),

ActivityBonusMax: float32(cfg.Retrieval.ActivityBonusMax),
ActivityBonusScale: float32(cfg.Retrieval.ActivityBonusScale),

Expand Down
23 changes: 21 additions & 2 deletions internal/agent/retrieval/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ type RetrievalConfig struct {
ActivityBonusMax float32 // cap on Hebbian activity bonus (default: 0.2)
ActivityBonusScale float32 // scale factor for activity bonus log curve (default: 0.02)

// Type-filtered query recency — when filtering by type, recency matters more
// than semantic match (the type already constrains relevance). These override
// RecencyBoostWeight/RecencyHalfLifeDays for type-filtered queries.
TypeFilterRecencyWeight float32 // max recency bonus for type-filtered queries (default: 0.5)
TypeFilterRecencyHalfLife float32 // half-life in days for type-filtered recency (default: 7)

// Significance multipliers
CriticalBoost float32 // multiplier for "critical" significance memories (default: 1.2)
ImportantBoost float32 // multiplier for "important" significance memories (default: 1.1)
Expand Down Expand Up @@ -103,6 +109,9 @@ func DefaultConfig() RetrievalConfig {
ActivityBonusMax: 0.2,
ActivityBonusScale: 0.02,

TypeFilterRecencyWeight: 0.5,
TypeFilterRecencyHalfLife: 7,

CriticalBoost: 1.2,
ImportantBoost: 1.1,

Expand Down Expand Up @@ -701,9 +710,19 @@ func (ra *RetrievalAgent) rankResults(ctx context.Context, activated map[string]
// Using CreatedAt (not LastAccessed) prevents a feedback loop where
// frequently-recalled memories continually reset their recency bonus
// via IncrementAccess. The activity bonus already rewards frequent access.
//
// For type-filtered queries, recency is amplified: the type filter already
// constrains relevance, so WHEN matters more than semantic match. This
// ensures the most recent handoff/decision/error surfaces first.
daysSinceCreated := float32(time.Since(mem.CreatedAt).Hours() / 24)
recencyWt := agentutil.Float32Or(ra.config.RecencyBoostWeight, 0.2)
recencyHL := agentutil.Float32Or(ra.config.RecencyHalfLifeDays, 30)
var recencyWt, recencyHL float32
if typeFiltered {
recencyWt = agentutil.Float32Or(ra.config.TypeFilterRecencyWeight, 0.5)
recencyHL = agentutil.Float32Or(ra.config.TypeFilterRecencyHalfLife, 7)
} else {
recencyWt = agentutil.Float32Or(ra.config.RecencyBoostWeight, 0.2)
recencyHL = agentutil.Float32Or(ra.config.RecencyHalfLifeDays, 30)
}
recencyBonus := recencyWt * float32(math.Exp(float64(-daysSinceCreated/recencyHL)))

// Hebbian activity bonus — frequently traversed associations indicate relevance.
Expand Down
163 changes: 163 additions & 0 deletions internal/agent/retrieval/config_behavior_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,169 @@ func TestConfigSynthesisMaxTokensPassedToLLM(t *testing.T) {
}
}

func TestConfigTypeFilterRecencyBoostsRecent(t *testing.T) {
	// Scenario: two handoff memories with identical salience.
	// m_old was created 7 days ago and has more associations (higher base activation).
	// m_new was created 30 minutes ago.
	// With the type-filter recency boost (weight 0.5, half-life 7 days), the new
	// handoff must rank above the old one despite the old one's association advantage.

	now := time.Now()
	recent := store.Memory{
		ID:        "m_new",
		Summary:   "session handoff 2026-04-11",
		Content:   "recent handoff content",
		Salience:  0.95,
		CreatedAt: now.Add(-30 * time.Minute),
		Source:    "mcp",
		Type:      "handoff",
	}
	stale := store.Memory{
		ID:        "m_old",
		Summary:   "session handoff 2026-04-04",
		Content:   "old handoff content",
		Salience:  0.95,
		CreatedAt: now.Add(-7 * 24 * time.Hour),
		Source:    "mcp",
		Type:      "handoff",
	}

	st := &mockStore{
		searchByFullTextFunc: func(_ context.Context, _ string, _ int) ([]store.Memory, error) {
			return nil, nil
		},
		searchByEmbeddingFunc: func(_ context.Context, _ []float32, _ int) ([]store.RetrievalResult, error) {
			return nil, nil
		},
		searchByTypeFunc: func(_ context.Context, _ []string, _ int) ([]store.Memory, error) {
			return []store.Memory{recent, stale}, nil
		},
		getAssociationsFunc: func(_ context.Context, memoryID string) ([]store.Association, error) {
			// Old memory has more associations — simulates richer graph
			if memoryID != "m_old" {
				return nil, nil
			}
			return []store.Association{
				{SourceID: "m_old", TargetID: "m_other1", Strength: 0.8, RelationType: "temporal", ActivationCount: 5},
				{SourceID: "m_old", TargetID: "m_other2", Strength: 0.7, RelationType: "similar", ActivationCount: 3},
			}, nil
		},
		getMemoryFunc: func(_ context.Context, id string) (store.Memory, error) {
			if id == "m_new" {
				return recent, nil
			}
			if id == "m_old" {
				return stale, nil
			}
			return store.Memory{ID: id, Salience: 0.5, CreatedAt: now.Add(-14 * 24 * time.Hour)}, nil
		},
	}

	agent := NewRetrievalAgent(st, &mockLLMProvider{}, DefaultConfig(), testLogger(), nil)

	resp, err := agent.Query(context.Background(), QueryRequest{
		Query: "session handoff",
		Type:  "handoff",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if len(resp.Memories) < 2 {
		t.Fatalf("expected at least 2 results, got %d", len(resp.Memories))
	}

	// The recent handoff must rank first
	if got := resp.Memories[0].Memory.ID; got != "m_new" {
		var scores []string
		for _, m := range resp.Memories {
			scores = append(scores, fmt.Sprintf("%s=%.4f", m.Memory.ID, m.Score))
		}
		t.Errorf("expected m_new (recent) to rank first, but got %s (scores: %v)", got, scores)
	}
}

func TestConfigTypeFilterRecencyParamsUsed(t *testing.T) {
	// Verify that the type-filter recency params are actually applied (not the
	// general ones) by using extreme values and checking the ranking effect.
	now := time.Now()

	fresh := store.Memory{
		ID:        "m_recent",
		Summary:   "recent decision",
		Salience:  0.5, // lower salience
		CreatedAt: now.Add(-1 * time.Hour),
		Source:    "mcp",
		Type:      "decision",
	}
	aged := store.Memory{
		ID:        "m_old",
		Summary:   "old decision",
		Salience:  0.9, // higher salience
		CreatedAt: now.Add(-30 * 24 * time.Hour),
		Source:    "mcp",
		Type:      "decision",
	}

	st := &mockStore{
		searchByFullTextFunc: func(_ context.Context, _ string, _ int) ([]store.Memory, error) {
			return nil, nil
		},
		searchByEmbeddingFunc: func(_ context.Context, _ []float32, _ int) ([]store.RetrievalResult, error) {
			return nil, nil
		},
		searchByTypeFunc: func(_ context.Context, _ []string, _ int) ([]store.Memory, error) {
			return []store.Memory{fresh, aged}, nil
		},
		getAssociationsFunc: func(_ context.Context, _ string) ([]store.Association, error) {
			return nil, nil
		},
		getMemoryFunc: func(_ context.Context, id string) (store.Memory, error) {
			if id == "m_recent" {
				return fresh, nil
			}
			if id == "m_old" {
				return aged, nil
			}
			return store.Memory{ID: id, Salience: 0.5, CreatedAt: now}, nil
		},
	}

	// Use aggressive type-filter recency: weight=1.0, half-life=1 day
	cfg := DefaultConfig()
	cfg.TypeFilterRecencyWeight = 1.0
	cfg.TypeFilterRecencyHalfLife = 1.0
	agent := NewRetrievalAgent(st, &mockLLMProvider{}, cfg, testLogger(), nil)

	resp, err := agent.Query(context.Background(), QueryRequest{
		Query: "decision",
		Type:  "decision",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if len(resp.Memories) < 2 {
		t.Fatalf("expected at least 2 results, got %d", len(resp.Memories))
	}

	// With weight=1.0 and half-life=1 day:
	//   m_recent (1 hour old):  bonus = 1.0 * exp(-0.04/1) ≈ 0.96
	//   m_old (30 days old):    bonus = 1.0 * exp(-30/1)   ≈ 0.0
	// Even though m_old has higher salience, the recency must dominate
	if top := resp.Memories[0].Memory.ID; top != "m_recent" {
		t.Errorf("expected m_recent to rank first with aggressive type-filter recency, got %s", top)
	}
}

func TestConfigMaxToolCallsLimitsSynthesisTools(t *testing.T) {
now := time.Now()

Expand Down
7 changes: 7 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,10 @@ type RetrievalConfig struct {
ActivityBonusMax float64 `yaml:"activity_bonus_max"`
ActivityBonusScale float64 `yaml:"activity_bonus_scale"`

// Type-filtered query recency (stronger recency for type-narrowed queries)
TypeFilterRecencyWeight float64 `yaml:"type_filter_recency_weight"`
TypeFilterRecencyHalfLife float64 `yaml:"type_filter_recency_half_life"`

// Significance multipliers
CriticalBoost float64 `yaml:"critical_boost"`
ImportantBoost float64 `yaml:"important_boost"`
Expand Down Expand Up @@ -763,6 +767,9 @@ func Default() *Config {
ActivityBonusMax: 0.2,
ActivityBonusScale: 0.02,

TypeFilterRecencyWeight: 0.5,
TypeFilterRecencyHalfLife: 7,

CriticalBoost: 1.2,
ImportantBoost: 1.1,

Expand Down
1 change: 1 addition & 0 deletions internal/mcp/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -2738,6 +2738,7 @@ func (srv *MCPServer) handleCheckMemory(ctx context.Context, args map[string]any
fmt.Fprintf(&sb, "Memory %s (encoded)\n", mem.ID)
fmt.Fprintf(&sb, " Raw ID: %s\n", mem.RawID)
fmt.Fprintf(&sb, " Summary: %s\n", mem.Summary)
fmt.Fprintf(&sb, " Content: %s\n", mem.Content)
fmt.Fprintf(&sb, " Concepts: %v\n", mem.Concepts)
fmt.Fprintf(&sb, " Salience: %.2f\n", mem.Salience)
fmt.Fprintf(&sb, " State: %s\n", mem.State)
Expand Down
63 changes: 63 additions & 0 deletions internal/mcp/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ import (
"encoding/json"
"io"
"log/slog"
"strings"
"testing"
"time"

"github.com/appsprout-dev/mnemonic/internal/events"
"github.com/appsprout-dev/mnemonic/internal/store"
"github.com/appsprout-dev/mnemonic/internal/store/storetest"
)

Expand All @@ -17,6 +19,17 @@ type mockStore struct {
storetest.MockStore
}

// checkMemoryStore overrides GetMemory to return a configurable memory.
// Embedding storetest.MockStore supplies stub implementations for the rest
// of the store interface.
type checkMemoryStore struct {
	storetest.MockStore

	memory store.Memory // returned by every GetMemory call
	err    error        // returned alongside memory (nil for success)
}

// GetMemory ignores its arguments and returns the configured fixture.
func (c *checkMemoryStore) GetMemory(context.Context, string) (store.Memory, error) {
	return c.memory, c.err
}

// mockBus is a minimal mock of the Bus interface for testing.
type mockBus struct{}

Expand Down Expand Up @@ -472,3 +485,53 @@ func TestContextMetricsJSON(t *testing.T) {
}
}
}

// TestHandleCheckMemoryIncludesContent verifies that check_memory output
// includes the full memory content, not just the summary.
func TestHandleCheckMemoryIncludesContent(t *testing.T) {
	logger := slog.New(slog.NewTextHandler(io.Discard, nil))

	fixture := store.Memory{
		ID:        "mem-123",
		RawID:     "raw-456",
		Summary:   "short summary",
		Content:   "This is the full memory content that must appear in check_memory output.",
		Concepts:  []string{"test", "content"},
		Salience:  0.85,
		State:     "active",
		Source:    "mcp",
		Type:      "handoff",
		CreatedAt: time.Now(),
	}

	srv := NewMCPServer(&checkMemoryStore{memory: fixture}, nil, &mockBus{}, logger, "test", "", []string{}, 0, nil, "", DefaultMemoryDefaults())

	result, err := srv.handleCheckMemory(context.Background(), map[string]any{
		"memory_id": "mem-123",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Extract the text from the tool result
	resultMap, ok := result.(map[string]any)
	if !ok {
		t.Fatalf("expected map result, got %T", result)
	}
	blocks, ok := resultMap["content"].([]map[string]any)
	if !ok || len(blocks) == 0 {
		t.Fatal("expected content array in result")
	}
	text, _ := blocks[0]["text"].(string)

	// Verify content is present
	if !strings.Contains(text, "Content: "+fixture.Content) {
		t.Errorf("check_memory output missing Content field.\nGot:\n%s", text)
	}

	// Verify summary is also present (regression check)
	if !strings.Contains(text, "Summary: "+fixture.Summary) {
		t.Errorf("check_memory output missing Summary field.\nGot:\n%s", text)
	}
}
47 changes: 43 additions & 4 deletions training/docs/experiment_registry.md
Original file line number Diff line number Diff line change
Expand Up @@ -1192,7 +1192,7 @@ Gemma E2B matches Qwen 4B on faithfulness while being 44% faster. The faithful p
### EXP-30: Gemma 4 E2B Spoke Training — Faithful Prompt + V7 Data

- **Date:** 2026-04-10
- **Status:** RUNNING
- **Status:** COMPLETED (training), SUSPENDED (evaluation — verdict pending llama.cpp spoke fix and spokes+GBNF re-evaluation)
- **Hypothesis:** Gemma 4 E2B with trained Felix spokes on v7 data + faithful prompt will achieve 100% SC (schema compliance) while maintaining the 100% EPR and 100% NP demonstrated by the base model with the faithful prompt in EXP-29. The spokes learn the structural schema that the base model can't produce without grammar enforcement.
- **Null hypothesis:** Spoke training on Gemma E2B degrades the faithfulness achieved by the faithful prompt alone (EPR drops below 90% or FR rises above 5%). The base model + prompt is sufficient and spokes add no value.
- **Variable:** Spoke adapters trained on v7 encoding data with faithful prompt format. Base model, prompt, and quantization held constant.
Expand All @@ -1205,6 +1205,45 @@ Gemma E2B matches Qwen 4B on faithfulness while being 44% faster. The faithful p
- **Export plan:** Export spokes via Gemma-specific export script, quantize to RQ4 via rotorq pipeline, deploy in embedded llama.cpp backend.
- **Tracking:** Branch feat/gemma-e2b-spokes
- **Overfit probe (2026-04-10):** 10 train / 5 eval, 200 optimizer steps, batch 1 x accum 1, LR 3e-4. Online train loss 14.4→5.2, eval loss 1.80→1.61. Online train loss was misleading — diagnostic showed batch-1 oscillation noise. Evaluating the final checkpoint on training data in eval mode gave loss 1.56 (PPL 4.8), confirming the model learned. Train eval-mode loss (1.56) < eval loss (1.61) — pipeline is working. Gates barely moved (expected at 200 steps). Autocast asymmetry ruled out as cause (NF4 outputs bf16 regardless). WandB: spokes_tmp_b1x1.
- **Full training run (2026-04-10):** 15,714 micro-steps (1,964 optimizer steps, ~3 epochs), batch 1 x accum 8 = 8 effective. Warmup 20 optimizer steps. Eval loss trajectory: 1.6830 (init) → 1.6823 (step 200) → 1.6713 (step 400) → 1.6480 (step 600). Steady decline, LR still ramping. Gates frozen at initialization through step 600 (expected — scalar_lr_scale 0.1 is conservative). WandB: exp30_gemma4_v7_faithful. Checkpoints: `checkpoints/exp30_gemma4_v7_faithful/`.
- **Result:** (pending)
- **Verdict:** (pending)
- **Full training run (2026-04-10):** Early stopped at step 5,800 (patience 5). Best checkpoint: step 4,800 (eval loss 1.2002, PPL 3.3). Config ran past the planned 1,964 optimizer steps (~3 epochs) because the cosine schedule wraps — training continued through ~7.4 epochs total. WandB: exp30_gemma4_v7_faithful. Checkpoints: `checkpoints/exp30_gemma4_v7_faithful/`.
- **Eval loss trajectory:**

| Step | Eval Loss | PPL | Delta | Phase |
|------|-----------|-----|-------|-------|
| init | 1.6830 | 5.4 | — | baseline |
| 200 | 1.6823 | 5.4 | -0.001 | warmup |
| 400 | 1.6713 | 5.3 | -0.011 | LR ramping |
| 600 | 1.6480 | 5.2 | -0.023 | |
| 800 | 1.6026 | 5.0 | -0.045 | peak LR |
| 1000 | 1.5786 | 4.8 | -0.024 | phase 1 best |
| 1200 | 1.6137 | 5.0 | +0.035 | regression |
| 1400 | 1.6694 | 5.3 | +0.056 | |
| 1600 | 1.6786 | 5.4 | +0.009 | near-init |
| 1800 | 1.6153 | 5.0 | -0.063 | recovery |
| 2000 | 1.5248 | 4.6 | -0.091 | phase 2 begins |
| 2200 | 1.4991 | 4.5 | -0.026 | |
| 2400 | 1.4657 | 4.3 | -0.033 | |
| 2600 | 1.4438 | 4.2 | -0.022 | |
| 2800 | 1.4145 | 4.1 | -0.029 | |
| 3000 | 1.3913 | 4.0 | -0.023 | |
| 3200 | 1.3113 | 3.7 | -0.080 | accelerating |
| 3400 | 1.3026 | 3.7 | -0.009 | |
| 3600 | 1.2493 | 3.5 | -0.053 | |
| 3800 | 1.2284 | 3.4 | -0.021 | below MI300X init |
| 4000 | 1.2110 | 3.4 | -0.017 | |
| 4200 | 1.2256 | 3.4 | +0.015 | patience 1 |
| 4400 | 1.2098 | 3.4 | -0.016 | recovered |
| 4600 | 1.2017 | 3.3 | -0.008 | |
| **4800** | **1.2002** | **3.3** | **-0.002** | **best** |
| 5000 | 1.2160 | 3.4 | +0.016 | patience 1 |
| 5200 | 1.2190 | 3.4 | +0.003 | patience 2 |
| 5400 | 1.2296 | 3.4 | +0.011 | patience 3 |
| 5600 | 1.2349 | 3.4 | +0.005 | patience 4 |
| 5800 | 1.2688 | 3.6 | +0.034 | early stop |

- **Training dynamics:** Two distinct phases. Phase 1 (steps 0-1000, peak cosine LR ~3e-4): fast improvement to 1.5786, then regression back to near-init as LR decayed — the spokes couldn't maintain learned behavior at intermediate LR with NF4 quantization noise. Phase 2 (steps 1800+, minimum cosine LR ~3e-5): stable second descent through 14 consecutive new bests. The minimum LR is the productive regime for NF4 spoke training. **Implication:** future NF4 runs should use lower peak LR or longer training at constant low LR.
- **Gate movement:** 8 of 35 layers shifted from initialization — layers 0, 1, 2, 3, 4, 5 (early) and 32, 33, 34 (late). Movement was small (0.001-0.002 per layer) but consistent. `scalar_lr_scale` 0.1 at peak LR 3e-4 = gate LR 3e-5 is too conservative for meaningful gate differentiation on NF4.
- **Evaluation (2026-04-11):** Multiple eval runs on 25 EXP-25 gold probes. Best result: 1/10 valid JSON (10%), 0 SC. The base model without spokes (EXP-29) achieves 24/25 valid JSON zero-shot. Diagnostic showed the model generates faithful *content* (entity preservation, correct facts) but cannot maintain valid JSON *structure* — `structured_concepts` has mixed types, fields are nested incorrectly, output truncated by verbose malformed sections. The model was trained on 5,238 perfectly structured examples but the spokes failed to learn schema compliance.
- **Result:** NEGATIVE. Best eval loss 1.2002 (PPL 3.3) does not translate to usable generation. The eval loss improvement (-0.483) is real for teacher-forced prediction but autoregressive generation with NF4 spokes degrades output quality below the base model's zero-shot capability.
- **Verdict:** INCONCLUSIVE. Python HF generate() with trained spokes produces valid faithful JSON (entity preservation, correct schema fields), but llama.cpp server with the same exported GGUF produces incoherent output. The discrepancy points to a bug in the llama.cpp fork's Gemma spoke application (gemma4-iswa.cpp), not a training failure. Additionally, GBNF grammar enforcement was never tested through a working inference path — the experiment cannot be judged until spokes + grammar are evaluated together. Verdict suspended pending: (1) llama.cpp spoke debugging, (2) spokes + GBNF eval on the 25 gold probes.
- **Key learning:** Do not declare verdicts based on incomplete inference pipelines. The eval script had multiple bugs (missing repetition_penalty, no markdown fence stripping, insufficient max_tokens) that produced false negatives. Always verify the inference path produces sane output on a trivial input before running the full evaluation.