Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cmd/mnemonic/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ func buildRetrievalConfig(cfg *config.Config) retrieval.RetrievalConfig {
RecencyBoostWeight: float32(cfg.Retrieval.RecencyBoostWeight),
RecencyHalfLifeDays: float32(cfg.Retrieval.RecencyHalfLifeDays),

TypeFilterRecencyWeight: float32(cfg.Retrieval.TypeFilterRecencyWeight),
TypeFilterRecencyHalfLife: float32(cfg.Retrieval.TypeFilterRecencyHalfLife),

ActivityBonusMax: float32(cfg.Retrieval.ActivityBonusMax),
ActivityBonusScale: float32(cfg.Retrieval.ActivityBonusScale),

Expand Down
23 changes: 21 additions & 2 deletions internal/agent/retrieval/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ type RetrievalConfig struct {
ActivityBonusMax float32 // cap on Hebbian activity bonus (default: 0.2)
ActivityBonusScale float32 // scale factor for activity bonus log curve (default: 0.02)

// Type-filtered query recency — when filtering by type, recency matters more
// than semantic match (the type already constrains relevance). These override
// RecencyBoostWeight/RecencyHalfLifeDays for type-filtered queries.
TypeFilterRecencyWeight float32 // max recency bonus for type-filtered queries (default: 0.5)
TypeFilterRecencyHalfLife float32 // half-life in days for type-filtered recency (default: 7)

// Significance multipliers
CriticalBoost float32 // multiplier for "critical" significance memories (default: 1.2)
ImportantBoost float32 // multiplier for "important" significance memories (default: 1.1)
Expand Down Expand Up @@ -103,6 +109,9 @@ func DefaultConfig() RetrievalConfig {
ActivityBonusMax: 0.2,
ActivityBonusScale: 0.02,

TypeFilterRecencyWeight: 0.5,
TypeFilterRecencyHalfLife: 7,

CriticalBoost: 1.2,
ImportantBoost: 1.1,

Expand Down Expand Up @@ -701,9 +710,19 @@ func (ra *RetrievalAgent) rankResults(ctx context.Context, activated map[string]
// Using CreatedAt (not LastAccessed) prevents a feedback loop where
// frequently-recalled memories continually reset their recency bonus
// via IncrementAccess. The activity bonus already rewards frequent access.
//
// For type-filtered queries, recency is amplified: the type filter already
// constrains relevance, so WHEN matters more than semantic match. This
// ensures the most recent handoff/decision/error surfaces first.
daysSinceCreated := float32(time.Since(mem.CreatedAt).Hours() / 24)
recencyWt := agentutil.Float32Or(ra.config.RecencyBoostWeight, 0.2)
recencyHL := agentutil.Float32Or(ra.config.RecencyHalfLifeDays, 30)
var recencyWt, recencyHL float32
if typeFiltered {
recencyWt = agentutil.Float32Or(ra.config.TypeFilterRecencyWeight, 0.5)
recencyHL = agentutil.Float32Or(ra.config.TypeFilterRecencyHalfLife, 7)
} else {
recencyWt = agentutil.Float32Or(ra.config.RecencyBoostWeight, 0.2)
recencyHL = agentutil.Float32Or(ra.config.RecencyHalfLifeDays, 30)
}
recencyBonus := recencyWt * float32(math.Exp(float64(-daysSinceCreated/recencyHL)))

// Hebbian activity bonus — frequently traversed associations indicate relevance.
Expand Down
163 changes: 163 additions & 0 deletions internal/agent/retrieval/config_behavior_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,169 @@ func TestConfigSynthesisMaxTokensPassedToLLM(t *testing.T) {
}
}

func TestConfigTypeFilterRecencyBoostsRecent(t *testing.T) {
	// Scenario: two handoff memories with identical salience.
	// m_old was created 7 days ago and has more associations (higher base activation).
	// m_new was created 30 minutes ago.
	// With the type-filter recency boost (weight 0.5, half-life 7 days), the new
	// handoff must rank above the old one despite the old one's association advantage.

	now := time.Now()
	recent := store.Memory{
		ID:        "m_new",
		Summary:   "session handoff 2026-04-11",
		Content:   "recent handoff content",
		Salience:  0.95,
		CreatedAt: now.Add(-30 * time.Minute),
		Source:    "mcp",
		Type:      "handoff",
	}
	stale := store.Memory{
		ID:        "m_old",
		Summary:   "session handoff 2026-04-04",
		Content:   "old handoff content",
		Salience:  0.95,
		CreatedAt: now.Add(-7 * 24 * time.Hour),
		Source:    "mcp",
		Type:      "handoff",
	}

	st := &mockStore{
		searchByFullTextFunc: func(_ context.Context, _ string, _ int) ([]store.Memory, error) {
			return nil, nil
		},
		searchByEmbeddingFunc: func(_ context.Context, _ []float32, _ int) ([]store.RetrievalResult, error) {
			return nil, nil
		},
		searchByTypeFunc: func(_ context.Context, _ []string, _ int) ([]store.Memory, error) {
			return []store.Memory{recent, stale}, nil
		},
		getAssociationsFunc: func(_ context.Context, memoryID string) ([]store.Association, error) {
			// Old memory has more associations — simulates richer graph
			if memoryID != "m_old" {
				return nil, nil
			}
			return []store.Association{
				{SourceID: "m_old", TargetID: "m_other1", Strength: 0.8, RelationType: "temporal", ActivationCount: 5},
				{SourceID: "m_old", TargetID: "m_other2", Strength: 0.7, RelationType: "similar", ActivationCount: 3},
			}, nil
		},
		getMemoryFunc: func(_ context.Context, id string) (store.Memory, error) {
			if id == "m_new" {
				return recent, nil
			}
			if id == "m_old" {
				return stale, nil
			}
			return store.Memory{ID: id, Salience: 0.5, CreatedAt: now.Add(-14 * 24 * time.Hour)}, nil
		},
	}

	agent := NewRetrievalAgent(st, &mockLLMProvider{}, DefaultConfig(), testLogger(), nil)

	resp, err := agent.Query(context.Background(), QueryRequest{
		Query: "session handoff",
		Type:  "handoff",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if len(resp.Memories) < 2 {
		t.Fatalf("expected at least 2 results, got %d", len(resp.Memories))
	}

	// The recent handoff must rank first
	if got := resp.Memories[0].Memory.ID; got != "m_new" {
		var scores []string
		for _, m := range resp.Memories {
			scores = append(scores, fmt.Sprintf("%s=%.4f", m.Memory.ID, m.Score))
		}
		t.Errorf("expected m_new (recent) to rank first, but got %s (scores: %v)", got, scores)
	}
}

func TestConfigTypeFilterRecencyParamsUsed(t *testing.T) {
	// Verify that the type-filter recency params are actually applied (not the
	// general ones) by using extreme values and checking the ranking effect.
	now := time.Now()

	fresh := store.Memory{
		ID:        "m_recent",
		Summary:   "recent decision",
		Salience:  0.5, // lower salience
		CreatedAt: now.Add(-1 * time.Hour),
		Source:    "mcp",
		Type:      "decision",
	}
	aged := store.Memory{
		ID:        "m_old",
		Summary:   "old decision",
		Salience:  0.9, // higher salience
		CreatedAt: now.Add(-30 * 24 * time.Hour),
		Source:    "mcp",
		Type:      "decision",
	}

	st := &mockStore{
		searchByFullTextFunc: func(_ context.Context, _ string, _ int) ([]store.Memory, error) {
			return nil, nil
		},
		searchByEmbeddingFunc: func(_ context.Context, _ []float32, _ int) ([]store.RetrievalResult, error) {
			return nil, nil
		},
		searchByTypeFunc: func(_ context.Context, _ []string, _ int) ([]store.Memory, error) {
			return []store.Memory{fresh, aged}, nil
		},
		getAssociationsFunc: func(_ context.Context, _ string) ([]store.Association, error) {
			return nil, nil
		},
		getMemoryFunc: func(_ context.Context, id string) (store.Memory, error) {
			if id == "m_recent" {
				return fresh, nil
			}
			if id == "m_old" {
				return aged, nil
			}
			return store.Memory{ID: id, Salience: 0.5, CreatedAt: now}, nil
		},
	}

	// Use aggressive type-filter recency: weight=1.0, half-life=1 day
	cfg := DefaultConfig()
	cfg.TypeFilterRecencyWeight = 1.0
	cfg.TypeFilterRecencyHalfLife = 1.0
	agent := NewRetrievalAgent(st, &mockLLMProvider{}, cfg, testLogger(), nil)

	resp, err := agent.Query(context.Background(), QueryRequest{
		Query: "decision",
		Type:  "decision",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if len(resp.Memories) < 2 {
		t.Fatalf("expected at least 2 results, got %d", len(resp.Memories))
	}

	// With weight=1.0 and half-life=1 day:
	//   m_recent (1 hour old):  bonus = 1.0 * exp(-0.04/1) ≈ 0.96
	//   m_old (30 days old):    bonus = 1.0 * exp(-30/1)   ≈ 0.0
	// Even though m_old has higher salience, the recency must dominate
	if top := resp.Memories[0].Memory.ID; top != "m_recent" {
		t.Errorf("expected m_recent to rank first with aggressive type-filter recency, got %s", top)
	}
}

func TestConfigMaxToolCallsLimitsSynthesisTools(t *testing.T) {
now := time.Now()

Expand Down
7 changes: 7 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,10 @@ type RetrievalConfig struct {
ActivityBonusMax float64 `yaml:"activity_bonus_max"`
ActivityBonusScale float64 `yaml:"activity_bonus_scale"`

// Type-filtered query recency (stronger recency for type-narrowed queries)
TypeFilterRecencyWeight float64 `yaml:"type_filter_recency_weight"`
TypeFilterRecencyHalfLife float64 `yaml:"type_filter_recency_half_life"`

// Significance multipliers
CriticalBoost float64 `yaml:"critical_boost"`
ImportantBoost float64 `yaml:"important_boost"`
Expand Down Expand Up @@ -763,6 +767,9 @@ func Default() *Config {
ActivityBonusMax: 0.2,
ActivityBonusScale: 0.02,

TypeFilterRecencyWeight: 0.5,
TypeFilterRecencyHalfLife: 7,

CriticalBoost: 1.2,
ImportantBoost: 1.1,

Expand Down
1 change: 1 addition & 0 deletions internal/mcp/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -2738,6 +2738,7 @@ func (srv *MCPServer) handleCheckMemory(ctx context.Context, args map[string]any
fmt.Fprintf(&sb, "Memory %s (encoded)\n", mem.ID)
fmt.Fprintf(&sb, " Raw ID: %s\n", mem.RawID)
fmt.Fprintf(&sb, " Summary: %s\n", mem.Summary)
fmt.Fprintf(&sb, " Content: %s\n", mem.Content)
fmt.Fprintf(&sb, " Concepts: %v\n", mem.Concepts)
fmt.Fprintf(&sb, " Salience: %.2f\n", mem.Salience)
fmt.Fprintf(&sb, " State: %s\n", mem.State)
Expand Down
63 changes: 63 additions & 0 deletions internal/mcp/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ import (
"encoding/json"
"io"
"log/slog"
"strings"
"testing"
"time"

"github.com/appsprout-dev/mnemonic/internal/events"
"github.com/appsprout-dev/mnemonic/internal/store"
"github.com/appsprout-dev/mnemonic/internal/store/storetest"
)

Expand All @@ -17,6 +19,17 @@ type mockStore struct {
storetest.MockStore
}

// checkMemoryStore overrides GetMemory to return a configurable memory.
// Embedding storetest.MockStore supplies stub implementations for the rest
// of the store interface.
type checkMemoryStore struct {
	storetest.MockStore

	memory store.Memory // returned by every GetMemory call
	err    error        // returned alongside memory (nil for success)
}

// GetMemory ignores its arguments and returns the configured fixture.
func (c *checkMemoryStore) GetMemory(context.Context, string) (store.Memory, error) {
	return c.memory, c.err
}

// mockBus is a minimal mock of the Bus interface for testing.
type mockBus struct{}

Expand Down Expand Up @@ -472,3 +485,53 @@ func TestContextMetricsJSON(t *testing.T) {
}
}
}

// TestHandleCheckMemoryIncludesContent verifies that check_memory output
// includes the full memory content, not just the summary.
func TestHandleCheckMemoryIncludesContent(t *testing.T) {
	logger := slog.New(slog.NewTextHandler(io.Discard, nil))

	fixture := store.Memory{
		ID:        "mem-123",
		RawID:     "raw-456",
		Summary:   "short summary",
		Content:   "This is the full memory content that must appear in check_memory output.",
		Concepts:  []string{"test", "content"},
		Salience:  0.85,
		State:     "active",
		Source:    "mcp",
		Type:      "handoff",
		CreatedAt: time.Now(),
	}

	srv := NewMCPServer(&checkMemoryStore{memory: fixture}, nil, &mockBus{}, logger, "test", "", []string{}, 0, nil, "", DefaultMemoryDefaults())

	result, err := srv.handleCheckMemory(context.Background(), map[string]any{
		"memory_id": "mem-123",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Extract the text from the tool result
	resultMap, ok := result.(map[string]any)
	if !ok {
		t.Fatalf("expected map result, got %T", result)
	}
	blocks, ok := resultMap["content"].([]map[string]any)
	if !ok || len(blocks) == 0 {
		t.Fatal("expected content array in result")
	}
	text, _ := blocks[0]["text"].(string)

	// Verify content is present
	if !strings.Contains(text, "Content: "+fixture.Content) {
		t.Errorf("check_memory output missing Content field.\nGot:\n%s", text)
	}

	// Verify summary is also present (regression check)
	if !strings.Contains(text, "Summary: "+fixture.Summary) {
		t.Errorf("check_memory output missing Summary field.\nGot:\n%s", text)
	}
}
47 changes: 43 additions & 4 deletions training/docs/experiment_registry.md
Original file line number Diff line number Diff line change
Expand Up @@ -1192,7 +1192,7 @@ Gemma E2B matches Qwen 4B on faithfulness while being 44% faster. The faithful p
### EXP-30: Gemma 4 E2B Spoke Training — Faithful Prompt + V7 Data

- **Date:** 2026-04-10
- **Status:** RUNNING
- **Status:** COMPLETED (training), SUSPENDED (evaluation — verdict pending llama.cpp spoke fix and spokes+GBNF re-evaluation)
- **Hypothesis:** Gemma 4 E2B with trained Felix spokes on v7 data + faithful prompt will achieve 100% SC (schema compliance) while maintaining the 100% EPR and 100% NP demonstrated by the base model with the faithful prompt in EXP-29. The spokes learn the structural schema that the base model can't produce without grammar enforcement.
- **Null hypothesis:** Spoke training on Gemma E2B degrades the faithfulness achieved by the faithful prompt alone (EPR drops below 90% or FR rises above 5%). The base model + prompt is sufficient and spokes add no value.
- **Variable:** Spoke adapters trained on v7 encoding data with faithful prompt format. Base model, prompt, and quantization held constant.
Expand All @@ -1205,6 +1205,45 @@ Gemma E2B matches Qwen 4B on faithfulness while being 44% faster. The faithful p
- **Export plan:** Export spokes via Gemma-specific export script, quantize to RQ4 via rotorq pipeline, deploy in embedded llama.cpp backend.
- **Tracking:** Branch feat/gemma-e2b-spokes
- **Overfit probe (2026-04-10):** 10 train / 5 eval, 200 optimizer steps, batch 1 x accum 1, LR 3e-4. Online train loss 14.4→5.2, eval loss 1.80→1.61. Online train loss was misleading — diagnostic showed batch-1 oscillation noise. Evaluating the final checkpoint on training data in eval mode gave loss 1.56 (PPL 4.8), confirming the model learned. Train eval-mode loss (1.56) < eval loss (1.61) — pipeline is working. Gates barely moved (expected at 200 steps). Autocast asymmetry ruled out as cause (NF4 outputs bf16 regardless). WandB: spokes_tmp_b1x1.
- **Full training run (2026-04-10):** 15,714 micro-steps (1,964 optimizer steps, ~3 epochs), batch 1 x accum 8 = 8 effective. Warmup 20 optimizer steps. Eval loss trajectory: 1.6830 (init) → 1.6823 (step 200) → 1.6713 (step 400) → 1.6480 (step 600). Steady decline, LR still ramping. Gates frozen at initialization through step 600 (expected — scalar_lr_scale 0.1 is conservative). WandB: exp30_gemma4_v7_faithful. Checkpoints: `checkpoints/exp30_gemma4_v7_faithful/`.
- **Result:** (pending)
- **Verdict:** (pending)
- **Full training run (2026-04-10):** Early stopped at step 5,800 (patience 5). Best checkpoint: step 4,800 (eval loss 1.2002, PPL 3.3). Config ran past the planned 1,964 optimizer steps (~3 epochs) because the cosine schedule wraps — training continued through ~7.4 epochs total. WandB: exp30_gemma4_v7_faithful. Checkpoints: `checkpoints/exp30_gemma4_v7_faithful/`.
- **Eval loss trajectory:**

| Step | Eval Loss | PPL | Delta | Phase |
|------|-----------|-----|-------|-------|
| init | 1.6830 | 5.4 | — | baseline |
| 200 | 1.6823 | 5.4 | -0.001 | warmup |
| 400 | 1.6713 | 5.3 | -0.011 | LR ramping |
| 600 | 1.6480 | 5.2 | -0.023 | |
| 800 | 1.6026 | 5.0 | -0.045 | peak LR |
| 1000 | 1.5786 | 4.8 | -0.024 | phase 1 best |
| 1200 | 1.6137 | 5.0 | +0.035 | regression |
| 1400 | 1.6694 | 5.3 | +0.056 | |
| 1600 | 1.6786 | 5.4 | +0.009 | near-init |
| 1800 | 1.6153 | 5.0 | -0.063 | recovery |
| 2000 | 1.5248 | 4.6 | -0.091 | phase 2 begins |
| 2200 | 1.4991 | 4.5 | -0.026 | |
| 2400 | 1.4657 | 4.3 | -0.033 | |
| 2600 | 1.4438 | 4.2 | -0.022 | |
| 2800 | 1.4145 | 4.1 | -0.029 | |
| 3000 | 1.3913 | 4.0 | -0.023 | |
| 3200 | 1.3113 | 3.7 | -0.080 | accelerating |
| 3400 | 1.3026 | 3.7 | -0.009 | |
| 3600 | 1.2493 | 3.5 | -0.053 | |
| 3800 | 1.2284 | 3.4 | -0.021 | below MI300X init |
| 4000 | 1.2110 | 3.4 | -0.017 | |
| 4200 | 1.2256 | 3.4 | +0.015 | patience 1 |
| 4400 | 1.2098 | 3.4 | -0.016 | recovered |
| 4600 | 1.2017 | 3.3 | -0.008 | |
| **4800** | **1.2002** | **3.3** | **-0.002** | **best** |
| 5000 | 1.2160 | 3.4 | +0.016 | patience 1 |
| 5200 | 1.2190 | 3.4 | +0.003 | patience 2 |
| 5400 | 1.2296 | 3.4 | +0.011 | patience 3 |
| 5600 | 1.2349 | 3.4 | +0.005 | patience 4 |
| 5800 | 1.2688 | 3.6 | +0.034 | early stop |

- **Training dynamics:** Two distinct phases. Phase 1 (steps 0-1000, peak cosine LR ~3e-4): fast improvement to 1.5786, then regression back to near-init as LR decayed — the spokes couldn't maintain learned behavior at intermediate LR with NF4 quantization noise. Phase 2 (steps 1800+, minimum cosine LR ~3e-5): stable second descent through 14 consecutive new bests. The minimum LR is the productive regime for NF4 spoke training. **Implication:** future NF4 runs should use lower peak LR or longer training at constant low LR.
- **Gate movement:** 8 of 35 layers shifted from initialization — layers 0, 1, 2, 3, 4, 5 (early) and 32, 33, 34 (late). Movement was small (0.001-0.002 per layer) but consistent. `scalar_lr_scale` 0.1 at peak LR 3e-4 = gate LR 3e-5 is too conservative for meaningful gate differentiation on NF4.
- **Evaluation (2026-04-11):** Multiple eval runs on 25 EXP-25 gold probes. Best result: 1/10 valid JSON (10%), 0 SC. The base model without spokes (EXP-29) achieves 24/25 valid JSON zero-shot. Diagnostic showed the model generates faithful *content* (entity preservation, correct facts) but cannot maintain valid JSON *structure* — `structured_concepts` has mixed types, fields are nested incorrectly, output truncated by verbose malformed sections. The model was trained on 5,238 perfectly structured examples but the spokes failed to learn schema compliance.
- **Result:** NEGATIVE. Best eval loss 1.2002 (PPL 3.3) does not translate to usable generation. The eval loss improvement (-0.483) is real for teacher-forced prediction but autoregressive generation with NF4 spokes degrades output quality below the base model's zero-shot capability.
- **Verdict:** INCONCLUSIVE. Python HF generate() with trained spokes produces valid faithful JSON (entity preservation, correct schema fields), but llama.cpp server with the same exported GGUF produces incoherent output. The discrepancy points to a bug in the llama.cpp fork's Gemma spoke application (gemma4-iswa.cpp), not a training failure. Additionally, GBNF grammar enforcement was never tested through a working inference path — the experiment cannot be judged until spokes + grammar are evaluated together. Verdict suspended pending: (1) llama.cpp spoke debugging, (2) spokes + GBNF eval on the 25 gold probes.
- **Key learning:** Do not declare verdicts based on incomplete inference pipelines. The eval script had multiple bugs (missing repetition_penalty, no markdown fence stripping, insufficient max_tokens) that produced false negatives. Always verify the inference path produces sane output on a trivial input before running the full evaluation.