From acdb385edcd89c24c44c7c3fffb95513eb7903eb Mon Sep 17 00:00:00 2001 From: Kirill Turanskiy <7106373+thebtf@users.noreply.github.com> Date: Sat, 21 Mar 2026 04:04:27 +0300 Subject: [PATCH 1/2] fix: orphan vector purge + low-quality pattern purge (migrations 041-042) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migration 041: DELETE FROM vectors WHERE sqlite_id NOT IN (SELECT id FROM observations) — purges ~229K orphan vectors from correct table name Migration 042: DELETE FROM patterns WHERE frequency < 5 — purges ~111K low-quality patterns accumulated from garbage SDK extraction — matches raised MinFrequency threshold (T019) --- internal/db/gorm/migrations.go | 34 +++++++++++++++++++++++++ internal/worker/handlers_maintenance.go | 1 + 2 files changed, 35 insertions(+) diff --git a/internal/db/gorm/migrations.go b/internal/db/gorm/migrations.go index 1ea9d0fe..aa4d92e2 100644 --- a/internal/db/gorm/migrations.go +++ b/internal/db/gorm/migrations.go @@ -1399,6 +1399,40 @@ func runMigrations(db *gorm.DB, embeddingDims int) error { return nil }, }, + // Migration 041: Purge orphan vectors — correct table name (vectors, not observation_vectors). + { + ID: "041_purge_orphan_vectors", + Migrate: func(tx *gorm.DB) error { + result := tx.Exec(`DELETE FROM vectors WHERE sqlite_id NOT IN (SELECT id FROM observations)`) + if result.Error != nil { + log.Warn().Err(result.Error).Msg("migration 041: orphan vector purge failed (non-fatal)") + return nil + } + log.Info().Int64("orphan_vectors_deleted", result.RowsAffected).Msg("migration 041: orphan vector purge complete") + return nil + }, + Rollback: func(tx *gorm.DB) error { + return nil + }, + }, + // Migration 042: Purge low-quality patterns (frequency < 5). + // With 111K+ patterns accumulated from garbage SDK extraction, most are noise. + // MinFrequency threshold was raised to 5 in T019 — patterns below this are worthless. + { + ID: "042_purge_low_quality_patterns", + Migrate: func(tx *gorm.DB) error { + result := tx.Exec(`DELETE FROM patterns WHERE frequency < 5`) + if result.Error != nil { + log.Warn().Err(result.Error).Msg("migration 042: pattern purge failed (non-fatal)") + return nil + } + log.Info().Int64("patterns_deleted", result.RowsAffected).Msg("migration 042: low-quality pattern purge complete") + return nil + }, + Rollback: func(tx *gorm.DB) error { + return nil + }, + }, }) if err := m.Migrate(); err != nil { return fmt.Errorf("run gormigrate migrations: %w", err) diff --git a/internal/worker/handlers_maintenance.go b/internal/worker/handlers_maintenance.go index 4d91bf8b..9a069421 100644 --- a/internal/worker/handlers_maintenance.go +++ b/internal/worker/handlers_maintenance.go @@ -343,3 +343,4 @@ func (s *Service) handlePatternCleanup(w http.ResponseWriter, r *http.Request) { "deleted": deleted, }) } + From efa50847774a7e4ffac05b6811f478a8499594e2 Mon Sep 17 00:00:00 2001 From: Kirill Turanskiy Date: Sat, 21 Mar 2026 14:27:31 +0300 Subject: [PATCH 2/2] feat: composite relevance scoring + radical observation cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migration 043: deletes ~170 garbage observations matching 45 title patterns (tool mechanics, task transitions, system prompt analysis, iSCSI debug, etc.) Composite scoring: search results re-ranked by: score = similarity × recencyDecay × typeWeight × importance - recencyDecay: 0.5^(age_days/7) — halves every 7 days - typeWeight: decision=1.4, bugfix=1.3, discovery=0.8, change=0.7 - importance: floor at 0.3 to prevent zero-scored observations Based on Deep Research (Gemini) + claude-mnemonic analysis findings. Sources: CrewAI, Mem0, Letta, Zep scoring formulas. --- internal/db/gorm/migrations.go | 92 +++++++++++++++++++++++++++++ internal/search/manager.go | 49 +++++++++++++++ internal/worker/handlers_context.go | 12 ++-- 3 files changed, 149 insertions(+), 4 deletions(-) diff --git a/internal/db/gorm/migrations.go b/internal/db/gorm/migrations.go index aa4d92e2..75a349a6 100644 --- a/internal/db/gorm/migrations.go +++ b/internal/db/gorm/migrations.go @@ -1433,6 +1433,98 @@ func runMigrations(db *gorm.DB, embeddingDims int) error { return nil }, }, + // Migration 043: Radical cleanup of garbage SDK-extracted observations. + // These observations were created by the SDK tool output extraction pipeline before v1.3.4 + // (whitelist mode). They are trivially discoverable facts, tool errors, status transitions, + // and cross-project noise that pollute semantic search and degrade agent performance. + { + ID: "043_radical_observation_cleanup", + Migrate: func(tx *gorm.DB) error { + garbagePatterns := []string{ + // Tool mechanics (trivially discoverable at runtime) + "Tool%Query Pattern%", + "Tool%Search%Pattern%", + "Tool%Naming Convention%", + "Tool%Selection%Pattern%", + "Tool Search%Found%", + "Tool%Match%Found%", + "Memory Store Tool%", + "Deferred Tool%", + "Exact Tool Match%", + + // Task status transitions (repeated 20+ times, zero value) + "Task Status%Transition%", + "Task%Completion%Confirmed%", + "Status Transition%", + "Status%Discrepancy%", + "No Work Available%", + + // Job tracking noise + "Job Status%", + "Job-Session ID%", + + // Process output artifacts + "Process Output%", + "Stderr%Handling%", + + // System prompt meta-observations + "Claude Anti-Sycophancy%", + "User Interaction Guidelines%", + "User Communication Guidelines%", + "Strict Verification Guidelines%", + "Copyright Enforcement%", + "Critical Reminders%", + "Search Scaling by%", + "Past Conversation Search%", + "System Prompt Access%", + "Anti-Sycophancy%", + "Keyword Extraction Guidelines%", + "Tone Consistency%", + "Zero-confirmation Rule%", + "Plugin Configuration Warnings%", + "Prioritize Internal Tools%", + + // Generic discoveries with no behavioral impact + "Brace%Discrepancy%", + "Brace%Detection%", + "Content Structure Pattern%", + "Severity Classification%", + "Pre-commit Check%", + "Commit Message%Convention%", + "Commit Message Structure%", + "File Size Monitoring%", + + // iSCSI debug noise (from nvmdfs project) + "iSCSI%", + + // Timestamp-based titles from subtitle parser + "00:%", + + // Test observations + "type test", + + // Robocopy/npm transient noise + "Robocopy%", + "npm install completion%", + } + + var totalDeleted int64 + for _, pattern := range garbagePatterns { + result := tx.Exec("DELETE FROM observations WHERE title LIKE ?", pattern) + if result.Error != nil { + log.Warn().Err(result.Error).Str("pattern", pattern).Msg("migration 043: delete failed") + continue + } + totalDeleted += result.RowsAffected + } + + log.Info().Int64("total_deleted", totalDeleted).Msg("migration 043: radical observation cleanup complete") + return nil + }, + Rollback: func(tx *gorm.DB) error { + return nil + }, + }, }) if err := m.Migrate(); err != nil { return fmt.Errorf("run gormigrate migrations: %w", err) diff --git a/internal/search/manager.go b/internal/search/manager.go index 6227e467..3e5ea917 100644 --- a/internal/search/manager.go +++ b/internal/search/manager.go @@ -119,6 +119,55 @@ func (m *SearchMetrics) GetStats() map[string]any { } } +// ApplyCompositeScoring re-ranks observations using multi-signal scoring. +// Formula: score = similarity × recencyDecay × typeWeight × max(importance, 0.3) +// This ensures that recent, high-importance decisions rank above old generic discoveries. +func ApplyCompositeScoring(observations []*models.Observation, similarityScores map[int64]float64) { + now := time.Now() + + // Type weights: decisions and patterns have higher behavioral impact + typeWeights := map[models.ObservationType]float64{ + "decision": 1.4, + "bugfix": 1.3, + "feature": 1.2, + "pattern": 1.2, + "discovery": 0.8, + "change": 0.7, + "refactor": 0.9, + } + + for _, obs := range observations { + sim := similarityScores[obs.ID] + if sim == 0 { + sim = 0.5 // default if no similarity score + } + + // Recency decay: half-life of 7 days + ageDays := now.Sub(time.Unix(obs.CreatedAtEpoch/1000, 0)).Hours() / 24.0 + recency := math.Pow(0.5, ageDays/7.0) + // Floor at 0.05 so old but very important observations don't disappear + if recency < 0.05 { + recency = 0.05 + } + + // Type weight + tw := 1.0 + if w, ok := typeWeights[obs.Type]; ok { + tw = w + } + + // Importance (floor at 0.3 so unscored observations aren't penalized to zero) + imp := obs.ImportanceScore + if imp < 0.3 { + imp = 0.3 + } + + // Composite score replaces raw similarity + compositeScore := sim * recency * tw * imp + similarityScores[obs.ID] = compositeScore + } +} + // Manager provides unified search across PostgreSQL and pgvector. type Manager struct { ctx context.Context diff --git a/internal/worker/handlers_context.go b/internal/worker/handlers_context.go index a010bf9e..ee0879a6 100644 --- a/internal/worker/handlers_context.go +++ b/internal/worker/handlers_context.go @@ -234,7 +234,6 @@ func (s *Service) handleSearchByPrompt(w http.ResponseWriter, r *http.Request) { } // Apply cross-encoder reranking if available - var reranked bool if s.reranker != nil && len(freshObservations) > 0 && usedVector { // Build candidates from observations with their bi-encoder scores candidates := make([]reranking.Candidate, len(freshObservations)) @@ -291,7 +290,6 @@ func (s *Service) handleSearchByPrompt(w http.ResponseWriter, r *http.Request) { } } freshObservations = reorderedObs - reranked = true log.Debug(). Int("candidates", len(candidates)). @@ -304,8 +302,14 @@ func (s *Service) handleSearchByPrompt(w http.ResponseWriter, r *http.Request) { clusteredObservations := clusterObservations(freshObservations, s.config.ClusteringThreshold) duplicatesRemoved := len(freshObservations) - len(clusteredObservations) - // Sort by similarity score (highest first) if we have scores and didn't rerank - if len(similarityScores) > 0 && len(clusteredObservations) > 0 && !reranked { + // Apply composite scoring (recency × type × importance) as a post-processing step. + // This re-weights scores already computed by vector search or cross-encoder reranking. + if len(clusteredObservations) > 0 { + search.ApplyCompositeScoring(clusteredObservations, similarityScores) + } + + // Sort by composite score (highest first) + if len(similarityScores) > 0 && len(clusteredObservations) > 0 { sort.Slice(clusteredObservations, func(i, j int) bool { scoreI := similarityScores[clusteredObservations[i].ID] scoreJ := similarityScores[clusteredObservations[j].ID]