From 9e874ab87a123eeda74d12945df5d78472151226 Mon Sep 17 00:00:00 2001 From: Caleb Gross Date: Thu, 9 Apr 2026 15:12:55 -0400 Subject: [PATCH] fix: encoding faithfulness + amend raw_id + dashboard timeline bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove getRelatedContext() from encoding pipeline — FTS5 keyword matching injected unrelated memory summaries into the LLM prompt, causing cross-contamination (#383). Also removes extractKeywords and joinConcepts (dead code after removal). - amend tool now accepts raw_id in addition to memory_id, resolving via GetMemoryByRawID when memory_id lookup fails (#382). Mirrors the check_memory pattern. - Dashboard: fix sticky "Today" header overlapping first timeline entry (top: 30px → 0, solid background). Fix time formatting producing single-digit minutes (manual zero-padding replaces locale-dependent toLocaleString). - Sync Python training_constants.py with Go buildCompressionPrompt (remove related_ctx parameter). Remove RELATED_MEMORY_STUB from prepare_faithfulness_data.py. Closes #382, closes #383 Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/agent/encoding/agent.go | 79 +---------------- internal/agent/encoding/agent_test.go | 85 ------------------- internal/api/server.go | 2 +- internal/mcp/server.go | 35 ++++++-- internal/mcp/tools.go | 10 ++- internal/web/static/css/components.css | 4 +- internal/web/static/js/timeline.js | 5 +- training/docs/experiment_registry.md | 17 ++++ training/scripts/export_qwen35_spokes.py | 11 ++- training/scripts/prepare_faithfulness_data.py | 11 +-- training/scripts/train_qwen_spokes.py | 2 +- training/scripts/training_constants.py | 3 - 12 files changed, 71 insertions(+), 193 deletions(-) diff --git a/internal/agent/encoding/agent.go b/internal/agent/encoding/agent.go index cc215c8e..9bde7087 100644 --- a/internal/agent/encoding/agent.go +++ b/internal/agent/encoding/agent.go @@ -1147,10 +1147,9 @@ func (ea *EncodingAgent) compressAndExtractConcepts(ctx context.Context, raw sto // Gather contextual information for richer encoding episodeCtx := ea.getEpisodeContext(ctx, raw) - relatedCtx := ea.getRelatedContext(ctx, raw) // Build the LLM prompt - prompt := buildCompressionPrompt(truncatedContent, raw.Source, raw.Type, episodeCtx, relatedCtx, ea.coachingInstructions, ea.config.ConceptVocabulary) + prompt := buildCompressionPrompt(truncatedContent, raw.Source, raw.Type, episodeCtx, ea.coachingInstructions, ea.config.ConceptVocabulary) req := llm.CompletionRequest{ Messages: []llm.Message{ @@ -1221,7 +1220,7 @@ func (ea *EncodingAgent) compressAndExtractConcepts(ctx context.Context, raw sto // NOTE: The prompt deliberately avoids showing a JSON template because the local LLM model // echoes template placeholder text verbatim into the output fields. Structured output // (response_format with json_schema) enforces the JSON structure instead. -func buildCompressionPrompt(content, source, memType, episodeCtx, relatedCtx, coachingInstructions string, conceptVocabulary []string) string { +func buildCompressionPrompt(content, source, memType, episodeCtx, coachingInstructions string, conceptVocabulary []string) string { var b strings.Builder if source == "ingest" { @@ -1268,10 +1267,6 @@ Fill in every JSON field based on the actual event content below: if episodeCtx != "" { b.WriteString(episodeCtx) } - if relatedCtx != "" { - b.WriteString(relatedCtx) - } - if coachingInstructions != "" { b.WriteString(coachingInstructions) b.WriteString("\n\n") @@ -1779,35 +1774,6 @@ func (ea *EncodingAgent) getEpisodeContext(ctx context.Context, raw store.RawMem return result } -// getRelatedContext gathers semantically similar existing memories for context. -func (ea *EncodingAgent) getRelatedContext(ctx context.Context, raw store.RawMemory) string { - // Use concept-based search with keywords from the raw content - words := extractKeywords(raw.Content) - if len(words) == 0 { - return "" - } - - if len(words) > 5 { - words = words[:5] - } - - related, err := ea.store.SearchByConcepts(ctx, words, 3) - if err != nil || len(related) == 0 { - return "" - } - - result := "RELATED EXISTING MEMORIES:\n" - for _, mem := range related { - result += fmt.Sprintf(" - [%s] %s (concepts: %s)\n", - mem.Timestamp.Format("2006-01-02 15:04"), - mem.Summary, - joinConcepts(mem.Concepts), - ) - } - result += "\n" - return result -} - // getEpisodeIDForRaw finds which episode a raw memory belongs to. // Checks both open and recently closed episodes since encoding is async // and the episode may close before encoding completes. @@ -1836,47 +1802,6 @@ func getEpisodeIDForRaw(ea *EncodingAgent, ctx context.Context, raw store.RawMem return "" } -// extractKeywords pulls significant words from content for concept search. -func extractKeywords(content string) []string { - // Simple keyword extraction: split, filter short/common words - words := strings.Fields(strings.ToLower(content)) - seen := make(map[string]bool) - var keywords []string - - stopWords := map[string]bool{ - "the": true, "a": true, "an": true, "is": true, "was": true, - "are": true, "were": true, "be": true, "been": true, "being": true, - "have": true, "has": true, "had": true, "do": true, "does": true, - "did": true, "will": true, "would": true, "could": true, "should": true, - "may": true, "might": true, "shall": true, "can": true, "to": true, - "of": true, "in": true, "for": true, "on": true, "with": true, - "at": true, "by": true, "from": true, "as": true, "into": true, - "through": true, "during": true, "before": true, "after": true, - "it": true, "its": true, "this": true, "that": true, "these": true, - "and": true, "but": true, "or": true, "nor": true, "not": true, - } - - for _, w := range words { - if len(w) < 3 || stopWords[w] || seen[w] { - continue - } - seen[w] = true - keywords = append(keywords, w) - if len(keywords) >= 10 { - break - } - } - return keywords -} - -// joinConcepts joins concepts with commas. -func joinConcepts(concepts []string) string { - if len(concepts) == 0 { - return "none" - } - return strings.Join(concepts, ", ") -} - // truncateString truncates a string to maxLen characters. // Uses rune-aware slicing to avoid splitting multi-byte UTF-8 characters. func truncateString(s string, maxLen int) string { diff --git a/internal/agent/encoding/agent_test.go b/internal/agent/encoding/agent_test.go index 16f566d6..72f8a84b 100644 --- a/internal/agent/encoding/agent_test.go +++ b/internal/agent/encoding/agent_test.go @@ -539,91 +539,6 @@ func TestHeuristicSalience(t *testing.T) { }) } -// --------------------------------------------------------------------------- -// Tests for extractKeywords -// --------------------------------------------------------------------------- - -func TestExtractKeywords(t *testing.T) { - t.Run("extracts meaningful words", func(t *testing.T) { - keywords := extractKeywords("debugging the authentication module for error handling") - - if len(keywords) == 0 { - t.Fatal("expected at least one keyword") - } - // Should not contain stop words - for _, kw := range keywords { - if kw == "the" || kw == "for" { - t.Errorf("unexpected stop word %q in keywords", kw) - } - } - }) - - t.Run("limits to 10 keywords", func(t *testing.T) { - longContent := strings.Repeat("alpha bravo charlie delta echo foxtrot golf hotel india juliet kilo lima ", 5) - keywords := extractKeywords(longContent) - - if len(keywords) > 10 { - t.Errorf("expected at most 10 keywords, got %d", len(keywords)) - } - }) - - t.Run("deduplicates words", func(t *testing.T) { - keywords := extractKeywords("testing testing testing testing") - count := 0 - for _, kw := range keywords { - if kw == "testing" { - count++ - } - } - if count > 1 { - t.Errorf("expected 'testing' to appear at most once, appeared %d times", count) - } - }) - - t.Run("empty content returns empty", func(t *testing.T) { - keywords := extractKeywords("") - if len(keywords) != 0 { - t.Errorf("expected empty keywords for empty content, got %v", keywords) - } - }) - - t.Run("filters short words", func(t *testing.T) { - keywords := extractKeywords("go is ok to do it") - for _, kw := range keywords { - if len(kw) < 3 { - t.Errorf("unexpected short word %q in keywords", kw) - } - } - }) -} - -// --------------------------------------------------------------------------- -// Tests for joinConcepts -// --------------------------------------------------------------------------- - -func TestJoinConcepts(t *testing.T) { - t.Run("joins concepts with comma", func(t *testing.T) { - result := joinConcepts([]string{"go", "testing", "memory"}) - if result != "go, testing, memory" { - t.Errorf("expected 'go, testing, memory', got %q", result) - } - }) - - t.Run("empty returns none", func(t *testing.T) { - result := joinConcepts([]string{}) - if result != "none" { - t.Errorf("expected 'none', got %q", result) - } - }) - - t.Run("single concept", func(t *testing.T) { - result := joinConcepts([]string{"single"}) - if result != "single" { - t.Errorf("expected 'single', got %q", result) - } - }) -} - // --------------------------------------------------------------------------- // Tests for isTemporalRelationship // --------------------------------------------------------------------------- diff --git a/internal/api/server.go b/internal/api/server.go index 9ea6528d..8fab0a21 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -30,7 +30,7 @@ type ServerConfig struct { type ServerDeps struct { Store store.Store LLM llm.Provider - ModelManager llm.ModelManager // can be nil if not using embedded provider + ModelManager llm.ModelManager // can be nil if not using embedded provider Bus events.Bus Retriever *retrieval.RetrievalAgent Consolidator routes.ConsolidationRunner // can be nil if disabled diff --git a/internal/mcp/server.go b/internal/mcp/server.go index 4663e5a6..5372b66d 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -2596,9 +2596,11 @@ func (srv *MCPServer) handleListExclusions(ctx context.Context, args map[string] // handleAmend updates a memory's content in place, preserving associations and history. func (srv *MCPServer) handleAmend(ctx context.Context, args map[string]interface{}) (interface{}, error) { - memoryID, ok := args["memory_id"].(string) - if !ok || memoryID == "" { - return nil, fmt.Errorf("memory_id parameter is required") + rawID, _ := args["raw_id"].(string) + memoryID, _ := args["memory_id"].(string) + + if rawID == "" && memoryID == "" { + return nil, fmt.Errorf("at least one of raw_id or memory_id is required") } correctedContent, ok := args["corrected_content"].(string) @@ -2606,6 +2608,23 @@ func (srv *MCPServer) handleAmend(ctx context.Context, args map[string]interface return nil, fmt.Errorf("corrected_content parameter is required") } + // Resolve to encoded memory ID — try memory_id first, fall back to raw_id + var resolvedID string + if memoryID != "" { + if _, err := srv.store.GetMemory(ctx, memoryID); err == nil { + resolvedID = memoryID + } + } + if resolvedID == "" && rawID != "" { + m, err := srv.store.GetMemoryByRawID(ctx, rawID) + if err == nil { + resolvedID = m.ID + } + } + if resolvedID == "" { + return nil, fmt.Errorf("memory not found — check that the ID is correct (use check_memory to look up by raw_id)") + } + // Generate a simple summary (first 120 chars of content) summary := correctedContent if len(summary) > 120 { @@ -2613,22 +2632,22 @@ func (srv *MCPServer) handleAmend(ctx context.Context, args map[string]interface } // Use empty concepts and embedding — encoding agent can re-process if needed - if err := srv.store.AmendMemory(ctx, memoryID, correctedContent, summary, nil, nil); err != nil { - srv.log.Error("failed to amend memory", "memory_id", memoryID, "error", err) + if err := srv.store.AmendMemory(ctx, resolvedID, correctedContent, summary, nil, nil); err != nil { + srv.log.Error("failed to amend memory", "memory_id", resolvedID, "error", err) return nil, fmt.Errorf("failed to amend memory: %w", err) } // Publish event if srv.bus != nil { _ = srv.bus.Publish(ctx, events.MemoryAmended{ - MemoryID: memoryID, + MemoryID: resolvedID, NewSummary: summary, Ts: time.Now(), }) } - srv.log.Info("memory amended", "memory_id", memoryID) - return toolResult(fmt.Sprintf("Amended memory %s. Content updated, associations and history preserved. Salience bumped +0.05.", memoryID)), nil + srv.log.Info("memory amended", "memory_id", resolvedID) + return toolResult(fmt.Sprintf("Amended memory %s. Content updated, associations and history preserved. Salience bumped +0.05.", resolvedID)), nil } // handleCheckMemory inspects a memory's encoding status, concepts, and associations. diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 6bb344b5..99c74338 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -591,20 +591,24 @@ func listExclusionsToolDef() ToolDefinition { func amendToolDef() ToolDefinition { return ToolDefinition{ Name: "amend", - Description: "Update a memory's content while preserving its ID, associations, activation history, and salience. Use when a recalled memory is stale or incorrect. Records an audit trail of the change.", + Description: "Update a memory's content while preserving its ID, associations, activation history, and salience. Use when a recalled memory is stale or incorrect. Records an audit trail of the change. Accepts either raw_id (from remember) or memory_id (encoded).", InputSchema: map[string]interface{}{ "type": "object", "properties": map[string]interface{}{ "memory_id": map[string]interface{}{ "type": "string", - "description": "The memory ID to amend", + "description": "The encoded memory ID to amend", + }, + "raw_id": map[string]interface{}{ + "type": "string", + "description": "The raw memory ID returned by remember — will be resolved to the encoded memory", }, "corrected_content": map[string]interface{}{ "type": "string", "description": "The updated memory content", }, }, - "required": []string{"memory_id", "corrected_content"}, + "required": []string{"corrected_content"}, }, } } diff --git a/internal/web/static/css/components.css b/internal/web/static/css/components.css index ac1bb01f..fee06eb5 100644 --- a/internal/web/static/css/components.css +++ b/internal/web/static/css/components.css @@ -466,10 +466,10 @@ blockquote.quote .quote-body { font-size: 0.88rem; font-weight: bold; color: var(--text-dim); - background: linear-gradient(to bottom, rgba(92,114,184,0.08), rgba(92,114,184,0.02)); + background: var(--bg-primary, #0f172a); border-bottom: 1px solid var(--border-color); position: sticky; - top: 30px; + top: 0; z-index: 50; display: flex; justify-content: space-between; diff --git a/internal/web/static/js/timeline.js b/internal/web/static/js/timeline.js index d4bbb429..1cab3b0a 100644 --- a/internal/web/static/js/timeline.js +++ b/internal/web/static/js/timeline.js @@ -158,7 +158,10 @@ export function renderTimelineItems() { export function renderTimelineCard(item, idx) { var kind = item._kind; var salPct = Math.min(100, Math.round((item._salience || 0) * 100)); - var absTime = item._date.toLocaleString(undefined, { hour: '2-digit', minute: '2-digit' }); + var h = item._date.getHours(), m = item._date.getMinutes(); + var ampm = h >= 12 ? 'PM' : 'AM'; + h = h % 12 || 12; + var absTime = h + ':' + (m < 10 ? '0' : '') + m + ' ' + ampm; var concepts = item._concepts || []; var source = item._source || ''; var project = item._project || ''; diff --git a/training/docs/experiment_registry.md b/training/docs/experiment_registry.md index a5a9e144..3788889b 100644 --- a/training/docs/experiment_registry.md +++ b/training/docs/experiment_registry.md @@ -1000,3 +1000,20 @@ Rotation parameter overhead per layer (rank=64): - **Tracking:** GitHub issue #381 (Phase 4) - **Result:** (pending — awaiting v7 gold-standard outputs from Gemini Batch API) - **Verdict:** (pending) + +### EXP-27: Qwen 3.5 4B — Model Scale Upgrade with V7 Data + +- **Date:** 2026-04-09 +- **Status:** REGISTERED +- **Hypothesis:** Qwen 3.5 4B (2560 hidden, 32 layers, 16/4 Q/KV heads) as the frozen base will match or exceed Qwen 3.5 2B spoke quality on encoding while providing a stronger foundation for multi-task spokes (synthesis, retrieval). The wider hidden dim and deeper architecture should improve faithfulness and generalization on diverse inputs without spoke architecture changes. +- **Variable:** Base model size (Qwen 3.5 2B → Qwen 3.5 4B). All other config matched to EXP-26. +- **Control:** EXP-26 (Qwen 3.5 2B, v7 data, same hardware). Direct comparison: same data, same spoke config (4 spokes, rank 64), same hyperparameters. +- **Prediction:** Faithfulness metrics match or exceed EXP-26 (EPR >90%, FR <5%, SC 100%). Eval loss ≤ EXP-26. Stress test 7/7. If 4B doesn't improve over 2B on encoding, the value is in multi-task spoke routing (synthesis/retrieval) where richer base representations matter. +- **Config:** Qwen 3.5 4B (frozen, bf16, ~8 GB) + 4 spokes rank 64 on all 32 layers (~33M trainable params, ~0.8% overhead), batch 1, grad_accum 8, seq_len 2375, LR 3e-4, scalar_lr_scale 0.1, Muon + AdamW, gradient_checkpointing, patience 5, eval_interval 200. Chunked cross-entropy (256 positions). Architecture note: 32 layers in 3:1 DeltaNet/attention ratio (24 DeltaNet + 8 full attention). Spokes applied to all 32 layers. +- **Data:** V7 dataset (same as EXP-26). Production prompt format via build_production_prompt(). Retokenized with Qwen 3.5 4B tokenizer (same tokenizer family, 248K vocab). +- **Hardware:** Local RX 7800 XT, 16GB VRAM, ROCm 7.2.1. Daemon stopped for training. VRAM budget: ~8 GB base (bf16) + ~132 MB spokes (fp32) + ~264 MB optimizer + activations (gradient checkpointing). Expected to fit within 16 GB. +- **Metrics:** Primary: 7-metric faithfulness eval (EPR, FR, TED, CCS, MIH, NP, SC). Secondary: eval loss/PPL, stress_test_hallucination.py (7/7 target), novel schema compliance. Tertiary: inference throughput (tok/s) at RQ4 via llama.cpp. +- **Inference plan:** Export via export_qwen35_spokes.py (now parameterized for any Qwen 3.5 size), quantize to RQ4 via rotorq_quantize_gguf.py, benchmark throughput on RX 7800 XT. Expected: ~2.25 GB weights (RQ4), ~60-70 tok/s. +- **Open question:** Should spokes be placed on all 32 layers, or only the 8 full-attention layers? DeltaNet layers use linear attention with recurrent state — spoke adaptation may not be needed there. Could test attention-only spoke placement as a follow-up (EXP-28). +- **Result:** (pending — blocked on EXP-26 completion) +- **Verdict:** (pending) diff --git a/training/scripts/export_qwen35_spokes.py b/training/scripts/export_qwen35_spokes.py index 1e46b015..0d229e94 100644 --- a/training/scripts/export_qwen35_spokes.py +++ b/training/scripts/export_qwen35_spokes.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Export Qwen 3.5 2B + trained spoke weights to a single GGUF file. +"""Export Qwen 3.5 + trained spoke weights to a single GGUF file. Two-phase approach: (1) convert the base HF model to GGUF using llama.cpp's standard converter, then (2) patch the GGUF to add spoke tensors and metadata @@ -12,6 +12,11 @@ --spokes checkpoints/exp20_v6_local/best_spokes.pt \ --output models/qwen35-2b-spokes-f16.gguf + python training/scripts/export_qwen35_spokes.py \ + --model models/qwen3.5-4b \ + --spokes checkpoints/exp27_v7_4b/best_spokes.pt \ + --output models/qwen35-4b-spokes-f16.gguf + Requires: pip install gguf numpy torch (in the felixlm venv) """ @@ -107,7 +112,9 @@ def main(): print(f" Output: {output_path}") # --- Phase 1: Convert base model to GGUF --- - base_gguf = output_path.parent / "qwen35-2b-f16.gguf" + # Derive base GGUF name from model directory (e.g., "qwen3.5-2b" -> "qwen35-2b-f16.gguf") + model_stem = model_path.name.replace(".", "") # "qwen3.5-4b" -> "qwen35-4b" + base_gguf = output_path.parent / f"{model_stem}-f16.gguf" if not base_gguf.exists(): print(f"\nPhase 1: Converting base model to GGUF...") converter = LLAMACPP_DIR / "convert_hf_to_gguf.py" diff --git a/training/scripts/prepare_faithfulness_data.py b/training/scripts/prepare_faithfulness_data.py index 345f6185..333cad7a 100644 --- a/training/scripts/prepare_faithfulness_data.py +++ b/training/scripts/prepare_faithfulness_data.py @@ -38,13 +38,7 @@ "(make test), 1 MCP remember call.\n\n" ) -RELATED_MEMORY_STUB = ( - "RELATED EXISTING MEMORIES (for context, do not copy into encoding):\n" - "- [mem-001] Decision: chose SQLite over Postgres for local-first simplicity\n" - "- [mem-002] Insight: spread activation with decay 0.7 limits distant associations\n\n" -) - -# Ids that get episode + related context (per issue spec: 2 of 25) +# Ids that get episode context (per issue spec: 2 of 25) CONTEXT_IDS = {3, 18} @@ -110,14 +104,11 @@ def format_for_training( # Build the production-format user prompt episode_ctx = EPISODE_CONTEXT_STUB if entry_id in CONTEXT_IDS else "" - related_ctx = RELATED_MEMORY_STUB if entry_id in CONTEXT_IDS else "" - user_prompt = build_production_prompt( content=raw_input, source=source, mem_type=mem_type, episode_ctx=episode_ctx, - related_ctx=related_ctx, ) # The assistant response is the gold JSON diff --git a/training/scripts/train_qwen_spokes.py b/training/scripts/train_qwen_spokes.py index 3798d85e..71998ab1 100644 --- a/training/scripts/train_qwen_spokes.py +++ b/training/scripts/train_qwen_spokes.py @@ -226,7 +226,7 @@ def train(args): ModelClass = GemmaWithSpokes if model_type == "gemma" else QwenWithSpokes extra_kwargs = {} if model_type == "qwen": - extra_kwargs["attn_implementation"] = "eager" # Flash attention may not work with hooks + extra_kwargs["attn_implementation"] = "sdpa" # Memory-efficient attention (SpokeWrappedLayer is SDPA-compatible) if model_type == "gemma" and not args.gradient_checkpointing: # No gradient checkpointing implies high-VRAM hardware — skip NF4 and PLE offload extra_kwargs["no_quantize"] = True diff --git a/training/scripts/training_constants.py b/training/scripts/training_constants.py index b2d24689..5ae3f7e4 100644 --- a/training/scripts/training_constants.py +++ b/training/scripts/training_constants.py @@ -102,7 +102,6 @@ def build_production_prompt( source: str = "mcp", mem_type: str = "general", episode_ctx: str = "", - related_ctx: str = "", coaching_instructions: str = "", concept_vocabulary: list[str] | None = None, ) -> str: @@ -162,8 +161,6 @@ def build_production_prompt( if episode_ctx: parts.append(episode_ctx) - if related_ctx: - parts.append(related_ctx) if coaching_instructions: parts.append(coaching_instructions) parts.append("\n\n")