diff --git a/cmd/mnemonic/runtime.go b/cmd/mnemonic/runtime.go index 64220f85..dde020b0 100644 --- a/cmd/mnemonic/runtime.go +++ b/cmd/mnemonic/runtime.go @@ -42,6 +42,9 @@ func buildRetrievalConfig(cfg *config.Config) retrieval.RetrievalConfig { RecencyBoostWeight: float32(cfg.Retrieval.RecencyBoostWeight), RecencyHalfLifeDays: float32(cfg.Retrieval.RecencyHalfLifeDays), + TypeFilterRecencyWeight: float32(cfg.Retrieval.TypeFilterRecencyWeight), + TypeFilterRecencyHalfLife: float32(cfg.Retrieval.TypeFilterRecencyHalfLife), + ActivityBonusMax: float32(cfg.Retrieval.ActivityBonusMax), ActivityBonusScale: float32(cfg.Retrieval.ActivityBonusScale), diff --git a/internal/agent/retrieval/agent.go b/internal/agent/retrieval/agent.go index 9630abc8..ec5bccc3 100644 --- a/internal/agent/retrieval/agent.go +++ b/internal/agent/retrieval/agent.go @@ -51,6 +51,12 @@ type RetrievalConfig struct { ActivityBonusMax float32 // cap on Hebbian activity bonus (default: 0.2) ActivityBonusScale float32 // scale factor for activity bonus log curve (default: 0.02) + // Type-filtered query recency — when filtering by type, recency matters more + // than semantic match (the type already constrains relevance). These override + // RecencyBoostWeight/RecencyHalfLifeDays for type-filtered queries. 
+ TypeFilterRecencyWeight float32 // max recency bonus for type-filtered queries (default: 0.5) + TypeFilterRecencyHalfLife float32 // half-life in days for type-filtered recency (default: 7) + // Significance multipliers CriticalBoost float32 // multiplier for "critical" significance memories (default: 1.2) ImportantBoost float32 // multiplier for "important" significance memories (default: 1.1) @@ -103,6 +109,9 @@ func DefaultConfig() RetrievalConfig { ActivityBonusMax: 0.2, ActivityBonusScale: 0.02, + TypeFilterRecencyWeight: 0.5, + TypeFilterRecencyHalfLife: 7, + CriticalBoost: 1.2, ImportantBoost: 1.1, @@ -701,9 +710,19 @@ func (ra *RetrievalAgent) rankResults(ctx context.Context, activated map[string] // Using CreatedAt (not LastAccessed) prevents a feedback loop where // frequently-recalled memories continually reset their recency bonus // via IncrementAccess. The activity bonus already rewards frequent access. + // + // For type-filtered queries, recency is amplified: the type filter already + // constrains relevance, so WHEN matters more than semantic match. This + // ensures the most recent handoff/decision/error surfaces first. daysSinceCreated := float32(time.Since(mem.CreatedAt).Hours() / 24) - recencyWt := agentutil.Float32Or(ra.config.RecencyBoostWeight, 0.2) - recencyHL := agentutil.Float32Or(ra.config.RecencyHalfLifeDays, 30) + var recencyWt, recencyHL float32 + if typeFiltered { + recencyWt = agentutil.Float32Or(ra.config.TypeFilterRecencyWeight, 0.5) + recencyHL = agentutil.Float32Or(ra.config.TypeFilterRecencyHalfLife, 7) + } else { + recencyWt = agentutil.Float32Or(ra.config.RecencyBoostWeight, 0.2) + recencyHL = agentutil.Float32Or(ra.config.RecencyHalfLifeDays, 30) + } recencyBonus := recencyWt * float32(math.Exp(float64(-daysSinceCreated/recencyHL))) // Hebbian activity bonus — frequently traversed associations indicate relevance. 
diff --git a/internal/agent/retrieval/config_behavior_test.go b/internal/agent/retrieval/config_behavior_test.go index 986809df..d3b7e936 100644 --- a/internal/agent/retrieval/config_behavior_test.go +++ b/internal/agent/retrieval/config_behavior_test.go @@ -357,6 +357,169 @@ func TestConfigSynthesisMaxTokensPassedToLLM(t *testing.T) { } } +func TestConfigTypeFilterRecencyBoostsRecent(t *testing.T) { + // Scenario: two handoff memories with identical salience. + // m_old was created 7 days ago and has more associations (higher base activation). + // m_new was created 30 minutes ago. + // With the type-filter recency boost (weight 0.5, half-life 7 days), the new + // handoff must rank above the old one despite the old one's association advantage. + + now := time.Now() + mNew := store.Memory{ + ID: "m_new", + Summary: "session handoff 2026-04-11", + Content: "recent handoff content", + Salience: 0.95, + CreatedAt: now.Add(-30 * time.Minute), + Source: "mcp", + Type: "handoff", + } + mOld := store.Memory{ + ID: "m_old", + Summary: "session handoff 2026-04-04", + Content: "old handoff content", + Salience: 0.95, + CreatedAt: now.Add(-7 * 24 * time.Hour), + Source: "mcp", + Type: "handoff", + } + + s := &mockStore{ + searchByFullTextFunc: func(_ context.Context, _ string, _ int) ([]store.Memory, error) { + return nil, nil + }, + searchByEmbeddingFunc: func(_ context.Context, _ []float32, _ int) ([]store.RetrievalResult, error) { + return nil, nil + }, + searchByTypeFunc: func(_ context.Context, _ []string, _ int) ([]store.Memory, error) { + return []store.Memory{mNew, mOld}, nil + }, + getAssociationsFunc: func(_ context.Context, memoryID string) ([]store.Association, error) { + // Old memory has more associations — simulates richer graph + if memoryID == "m_old" { + return []store.Association{ + {SourceID: "m_old", TargetID: "m_other1", Strength: 0.8, RelationType: "temporal", ActivationCount: 5}, + {SourceID: "m_old", TargetID: "m_other2", Strength: 0.7, RelationType: 
"similar", ActivationCount: 3}, + }, nil + } + return nil, nil + }, + getMemoryFunc: func(_ context.Context, id string) (store.Memory, error) { + switch id { + case "m_new": + return mNew, nil + case "m_old": + return mOld, nil + default: + return store.Memory{ID: id, Salience: 0.5, CreatedAt: now.Add(-14 * 24 * time.Hour)}, nil + } + }, + } + + cfg := DefaultConfig() + agent := NewRetrievalAgent(s, &mockLLMProvider{}, cfg, testLogger(), nil) + + resp, err := agent.Query(context.Background(), QueryRequest{ + Query: "session handoff", + Type: "handoff", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(resp.Memories) < 2 { + t.Fatalf("expected at least 2 results, got %d", len(resp.Memories)) + } + + // The recent handoff must rank first + if resp.Memories[0].Memory.ID != "m_new" { + t.Errorf("expected m_new (recent) to rank first, but got %s (scores: %v)", + resp.Memories[0].Memory.ID, + func() []string { + var s []string + for _, m := range resp.Memories { + s = append(s, fmt.Sprintf("%s=%.4f", m.Memory.ID, m.Score)) + } + return s + }()) + } +} + +func TestConfigTypeFilterRecencyParamsUsed(t *testing.T) { + // Verify that the type-filter recency params are actually applied (not the + // general ones) by using extreme values and checking the ranking effect. 
+ now := time.Now() + + mRecent := store.Memory{ + ID: "m_recent", + Summary: "recent decision", + Salience: 0.5, // lower salience + CreatedAt: now.Add(-1 * time.Hour), + Source: "mcp", + Type: "decision", + } + mOld := store.Memory{ + ID: "m_old", + Summary: "old decision", + Salience: 0.9, // higher salience + CreatedAt: now.Add(-30 * 24 * time.Hour), + Source: "mcp", + Type: "decision", + } + + s := &mockStore{ + searchByFullTextFunc: func(_ context.Context, _ string, _ int) ([]store.Memory, error) { + return nil, nil + }, + searchByEmbeddingFunc: func(_ context.Context, _ []float32, _ int) ([]store.RetrievalResult, error) { + return nil, nil + }, + searchByTypeFunc: func(_ context.Context, _ []string, _ int) ([]store.Memory, error) { + return []store.Memory{mRecent, mOld}, nil + }, + getAssociationsFunc: func(_ context.Context, _ string) ([]store.Association, error) { + return nil, nil + }, + getMemoryFunc: func(_ context.Context, id string) (store.Memory, error) { + switch id { + case "m_recent": + return mRecent, nil + case "m_old": + return mOld, nil + default: + return store.Memory{ID: id, Salience: 0.5, CreatedAt: now}, nil + } + }, + } + + // Use aggressive type-filter recency: weight=1.0, half-life=1 day + cfg := DefaultConfig() + cfg.TypeFilterRecencyWeight = 1.0 + cfg.TypeFilterRecencyHalfLife = 1.0 + agent := NewRetrievalAgent(s, &mockLLMProvider{}, cfg, testLogger(), nil) + + resp, err := agent.Query(context.Background(), QueryRequest{ + Query: "decision", + Type: "decision", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(resp.Memories) < 2 { + t.Fatalf("expected at least 2 results, got %d", len(resp.Memories)) + } + + // With weight=1.0 and half-life=1 day: + // m_recent (1 hour old): bonus = 1.0 * exp(-0.04/1) ≈ 0.96 + // m_old (30 days old): bonus = 1.0 * exp(-30/1) ≈ 0.0 + // Even though m_old has higher salience, the recency must dominate + if resp.Memories[0].Memory.ID != "m_recent" { + t.Errorf("expected 
m_recent to rank first with aggressive type-filter recency, got %s", + resp.Memories[0].Memory.ID) + } +} + func TestConfigMaxToolCallsLimitsSynthesisTools(t *testing.T) { now := time.Now() diff --git a/internal/config/config.go b/internal/config/config.go index b9a5deda..5da43ff2 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -302,6 +302,10 @@ type RetrievalConfig struct { ActivityBonusMax float64 `yaml:"activity_bonus_max"` ActivityBonusScale float64 `yaml:"activity_bonus_scale"` + // Type-filtered query recency (stronger recency for type-narrowed queries) + TypeFilterRecencyWeight float64 `yaml:"type_filter_recency_weight"` + TypeFilterRecencyHalfLife float64 `yaml:"type_filter_recency_half_life"` + // Significance multipliers CriticalBoost float64 `yaml:"critical_boost"` ImportantBoost float64 `yaml:"important_boost"` @@ -763,6 +767,9 @@ func Default() *Config { ActivityBonusMax: 0.2, ActivityBonusScale: 0.02, + TypeFilterRecencyWeight: 0.5, + TypeFilterRecencyHalfLife: 7, + CriticalBoost: 1.2, ImportantBoost: 1.1, diff --git a/internal/mcp/server.go b/internal/mcp/server.go index f19f99d2..e99c120f 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -2738,6 +2738,7 @@ func (srv *MCPServer) handleCheckMemory(ctx context.Context, args map[string]any fmt.Fprintf(&sb, "Memory %s (encoded)\n", mem.ID) fmt.Fprintf(&sb, " Raw ID: %s\n", mem.RawID) fmt.Fprintf(&sb, " Summary: %s\n", mem.Summary) + fmt.Fprintf(&sb, " Content: %s\n", mem.Content) fmt.Fprintf(&sb, " Concepts: %v\n", mem.Concepts) fmt.Fprintf(&sb, " Salience: %.2f\n", mem.Salience) fmt.Fprintf(&sb, " State: %s\n", mem.State) diff --git a/internal/mcp/server_test.go b/internal/mcp/server_test.go index c65dc2b0..1bd850e4 100644 --- a/internal/mcp/server_test.go +++ b/internal/mcp/server_test.go @@ -5,10 +5,12 @@ import ( "encoding/json" "io" "log/slog" + "strings" "testing" "time" "github.com/appsprout-dev/mnemonic/internal/events" + 
"github.com/appsprout-dev/mnemonic/internal/store" "github.com/appsprout-dev/mnemonic/internal/store/storetest" ) @@ -17,6 +19,17 @@ type mockStore struct { storetest.MockStore } +// checkMemoryStore overrides GetMemory to return a configurable memory. +type checkMemoryStore struct { + storetest.MockStore + memory store.Memory + err error +} + +func (s *checkMemoryStore) GetMemory(_ context.Context, _ string) (store.Memory, error) { + return s.memory, s.err +} + // mockBus is a minimal mock of the Bus interface for testing. type mockBus struct{} @@ -472,3 +485,53 @@ func TestContextMetricsJSON(t *testing.T) { } } } + +// TestHandleCheckMemoryIncludesContent verifies that check_memory output +// includes the full memory content, not just the summary. +func TestHandleCheckMemoryIncludesContent(t *testing.T) { + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + + mem := store.Memory{ + ID: "mem-123", + RawID: "raw-456", + Summary: "short summary", + Content: "This is the full memory content that must appear in check_memory output.", + Concepts: []string{"test", "content"}, + Salience: 0.85, + State: "active", + Source: "mcp", + Type: "handoff", + CreatedAt: time.Now(), + } + + s := &checkMemoryStore{memory: mem} + srv := NewMCPServer(s, nil, &mockBus{}, logger, "test", "", []string{}, 0, nil, "", DefaultMemoryDefaults()) + + result, err := srv.handleCheckMemory(context.Background(), map[string]any{ + "memory_id": "mem-123", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Extract the text from the tool result + resultMap, ok := result.(map[string]any) + if !ok { + t.Fatalf("expected map result, got %T", result) + } + contentArray, ok := resultMap["content"].([]map[string]any) + if !ok || len(contentArray) == 0 { + t.Fatal("expected content array in result") + } + text, _ := contentArray[0]["text"].(string) + + // Verify content is present + if !strings.Contains(text, "Content: "+mem.Content) { + t.Errorf("check_memory output missing 
Content field.\nGot:\n%s", text) + } + + // Verify summary is also present (regression check) + if !strings.Contains(text, "Summary: "+mem.Summary) { + t.Errorf("check_memory output missing Summary field.\nGot:\n%s", text) + } +} diff --git a/training/docs/experiment_registry.md b/training/docs/experiment_registry.md index a29243a9..b3ed33eb 100644 --- a/training/docs/experiment_registry.md +++ b/training/docs/experiment_registry.md @@ -1192,7 +1192,7 @@ Gemma E2B matches Qwen 4B on faithfulness while being 44% faster. The faithful p ### EXP-30: Gemma 4 E2B Spoke Training — Faithful Prompt + V7 Data - **Date:** 2026-04-10 -- **Status:** RUNNING +- **Status:** COMPLETED (training), PENDING (evaluation) - **Hypothesis:** Gemma 4 E2B with trained Felix spokes on v7 data + faithful prompt will achieve 100% SC (schema compliance) while maintaining the 100% EPR and 100% NP demonstrated by the base model with the faithful prompt in EXP-29. The spokes learn the structural schema that the base model can't produce without grammar enforcement. - **Null hypothesis:** Spoke training on Gemma E2B degrades the faithfulness achieved by the faithful prompt alone (EPR drops below 90% or FR rises above 5%). The base model + prompt is sufficient and spokes add no value. - **Variable:** Spoke adapters trained on v7 encoding data with faithful prompt format. Base model, prompt, and quantization held constant. @@ -1205,6 +1205,45 @@ Gemma E2B matches Qwen 4B on faithfulness while being 44% faster. The faithful p - **Export plan:** Export spokes via Gemma-specific export script, quantize to RQ4 via rotorq pipeline, deploy in embedded llama.cpp backend. - **Tracking:** Branch feat/gemma-e2b-spokes - **Overfit probe (2026-04-10):** 10 train / 5 eval, 200 optimizer steps, batch 1 x accum 1, LR 3e-4. Online train loss 14.4→5.2, eval loss 1.80→1.61. Online train loss was misleading — diagnostic showed batch-1 oscillation noise. 
Evaluating the final checkpoint on training data in eval mode gave loss 1.56 (PPL 4.8), confirming the model learned. Train eval-mode loss (1.56) < eval loss (1.61) — pipeline is working. Gates barely moved (expected at 200 steps). Autocast asymmetry ruled out as cause (NF4 outputs bf16 regardless). WandB: spokes_tmp_b1x1. -- **Full training run (2026-04-10):** 15,714 micro-steps (1,964 optimizer steps, ~3 epochs), batch 1 x accum 8 = 8 effective. Warmup 20 optimizer steps. Eval loss trajectory: 1.6830 (init) → 1.6823 (step 200) → 1.6713 (step 400) → 1.6480 (step 600). Steady decline, LR still ramping. Gates frozen at initialization through step 600 (expected — scalar_lr_scale 0.1 is conservative). WandB: exp30_gemma4_v7_faithful. Checkpoints: `checkpoints/exp30_gemma4_v7_faithful/`. -- **Result:** (pending) -- **Verdict:** (pending) +- **Full training run (2026-04-10):** Early stopped at step 5,800 (patience 5). Best checkpoint: step 4,800 (eval loss 1.2002, PPL 3.3). Config ran past the planned 1,964 optimizer steps (~3 epochs) because the cosine schedule wraps — training continued through ~8.9 epochs total (best checkpoint at step 4,800 ≈ 7.3 epochs). WandB: exp30_gemma4_v7_faithful. Checkpoints: `checkpoints/exp30_gemma4_v7_faithful/`. 
+- **Eval loss trajectory:** + +| Step | Eval Loss | PPL | Delta | Phase | +|------|-----------|-----|-------|-------| +| init | 1.6830 | 5.4 | — | baseline | +| 200 | 1.6823 | 5.4 | -0.001 | warmup | +| 400 | 1.6713 | 5.3 | -0.011 | LR ramping | +| 600 | 1.6480 | 5.2 | -0.023 | | +| 800 | 1.6026 | 5.0 | -0.045 | peak LR | +| 1000 | 1.5786 | 4.8 | -0.024 | phase 1 best | +| 1200 | 1.6137 | 5.0 | +0.035 | regression | +| 1400 | 1.6694 | 5.3 | +0.056 | | +| 1600 | 1.6786 | 5.4 | +0.009 | near-init | +| 1800 | 1.6153 | 5.0 | -0.063 | recovery | +| 2000 | 1.5248 | 4.6 | -0.091 | phase 2 begins | +| 2200 | 1.4991 | 4.5 | -0.026 | | +| 2400 | 1.4657 | 4.3 | -0.033 | | +| 2600 | 1.4438 | 4.2 | -0.022 | | +| 2800 | 1.4145 | 4.1 | -0.029 | | +| 3000 | 1.3913 | 4.0 | -0.023 | | +| 3200 | 1.3113 | 3.7 | -0.080 | accelerating | +| 3400 | 1.3026 | 3.7 | -0.009 | | +| 3600 | 1.2493 | 3.5 | -0.053 | | +| 3800 | 1.2284 | 3.4 | -0.021 | below MI300X init | +| 4000 | 1.2110 | 3.4 | -0.017 | | +| 4200 | 1.2256 | 3.4 | +0.015 | patience 1 | +| 4400 | 1.2098 | 3.4 | -0.016 | recovered | +| 4600 | 1.2017 | 3.3 | -0.008 | | +| **4800** | **1.2002** | **3.3** | **-0.002** | **best** | +| 5000 | 1.2160 | 3.4 | +0.016 | patience 1 | +| 5200 | 1.2190 | 3.4 | +0.003 | patience 2 | +| 5400 | 1.2296 | 3.4 | +0.011 | patience 3 | +| 5600 | 1.2349 | 3.4 | +0.005 | patience 4 | +| 5800 | 1.2688 | 3.6 | +0.034 | early stop | + +- **Training dynamics:** Two distinct phases. Phase 1 (steps 0-1000, peak cosine LR ~3e-4): fast improvement to 1.5786, then regression back to near-init as LR decayed — the spokes couldn't maintain learned behavior at intermediate LR with NF4 quantization noise. Phase 2 (steps 1800+, minimum cosine LR ~3e-5): stable second descent through 14 consecutive new bests. The minimum LR is the productive regime for NF4 spoke training. **Implication:** future NF4 runs should use lower peak LR or longer training at constant low LR. 
+- **Gate movement:** 9 of 35 layers shifted from initialization — layers 0, 1, 2, 3, 4, 5 (early) and 32, 33, 34 (late). Movement was small (0.001-0.002 per layer) but consistent. `scalar_lr_scale=0.1` at peak LR 3e-4 (i.e. effective gate LR 3e-5) is too conservative for meaningful gate differentiation on NF4. +- **Evaluation (2026-04-11):** Multiple eval runs on 25 EXP-25 gold probes. Best result: 1/10 valid JSON (10%), 0 SC. The base model without spokes (EXP-29) achieves 24/25 valid JSON zero-shot. Diagnostic showed the model generates faithful *content* (entity preservation, correct facts) but cannot maintain valid JSON *structure* — `structured_concepts` has mixed types, fields are nested incorrectly, output truncated by verbose malformed sections. The model was trained on 5,238 perfectly structured examples but the spokes failed to learn schema compliance. +- **Result:** NEGATIVE. Best eval loss 1.2002 (PPL 3.3) does not translate to usable generation. The eval loss improvement (-0.483) is real for teacher-forced prediction but autoregressive generation with NF4 spokes degrades output quality below the base model's zero-shot capability. +- **Verdict:** INCONCLUSIVE. Python HF generate() with trained spokes produces valid faithful JSON (entity preservation, correct schema fields), but llama.cpp server with the same exported GGUF produces incoherent output. The discrepancy points to a bug in the llama.cpp fork's Gemma spoke application (gemma4-iswa.cpp), not a training failure. Additionally, GBNF grammar enforcement was never tested through a working inference path — the experiment cannot be judged until spokes + grammar are evaluated together. Verdict suspended pending: (1) llama.cpp spoke debugging, (2) spokes + GBNF eval on the 25 gold probes. +- **Key learning:** Do not declare verdicts based on incomplete inference pipelines. The eval script had multiple bugs (missing repetition_penalty, no markdown fence stripping, insufficient max_tokens) that produced false negatives. 
Always verify the inference path produces sane output on a trivial input before running the full evaluation.