From d70697af6b85e0310a0e89b4a3032bfb551bf05a Mon Sep 17 00:00:00 2001 From: chancsc Date: Wed, 13 May 2026 18:22:38 +0800 Subject: [PATCH 1/4] fix: raise cosine dedup threshold to prevent same-domain false UPDATE nomic-embed-text produces cosine ~0.75 for same-domain different-fact pairs (e.g. two butterfly survey records at different locations). The old threshold of 0.70 let cosine override token similarity, incorrectly classifying distinct insights as UPDATE and replacing the original. Raising to 0.85 ensures cosine only confirms deduplication when texts are genuinely near-identical. Adds regression test with controlled 0.75-cosine fake embeddings. Co-Authored-By: Claude Sonnet 4.6 --- internal/search/diff.go | 9 +++++---- internal/search/diff_test.go | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/internal/search/diff.go b/internal/search/diff.go index b897eec..832111e 100644 --- a/internal/search/diff.go +++ b/internal/search/diff.go @@ -76,10 +76,11 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe } } - // Combined similarity: cosine only contributes when above 0.7 - // (below that, same-domain but different content can produce false matches) + // Combined similarity: cosine only contributes when above 0.85. + // Below that, same-domain content (e.g. two butterfly survey locations) + // clusters around 0.70–0.84 and produces false UPDATE matches. 
similarity := tokenSim - if cosineSim >= 0.7 && cosineSim > similarity { + if cosineSim >= 0.85 && cosineSim > similarity { similarity = cosineSim } @@ -139,7 +140,7 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe } tokenSim := ContentSimilarity(newContent, ins.Content) similarity := tokenSim - if cp.sim >= 0.7 && cp.sim > similarity { + if cp.sim >= 0.85 && cp.sim > similarity { similarity = cp.sim } suggestion := classifySuggestion(similarity, newContent, ins.Content) diff --git a/internal/search/diff_test.go b/internal/search/diff_test.go index 802e496..c012a0c 100644 --- a/internal/search/diff_test.go +++ b/internal/search/diff_test.go @@ -105,6 +105,29 @@ func TestDiff_DuplicateOverridesOverall(t *testing.T) { } } +func TestDiff_SameDomainCosineNoOverride(t *testing.T) { + // Regression: same-domain facts with different locations must not trigger UPDATE. + // nomic-embed-text produces cosine ~0.75 for same-domain different-fact pairs. + // The old threshold (0.70) let cosine override token similarity and incorrectly + // classified as UPDATE, replacing the original insight. The fix raises it to 0.85. + insights := []*model.Insight{ + {ID: "kinabalu", Content: "Dichorragia nesimachus singleton at Kinabalu Park, Sabah."}, + } + // Two unit vectors with cosine similarity = 0.75: simulates same-domain different-fact embeddings. 
+ newVec := []float64{1.0, 0.0} + existVec := []float64{0.75, 0.6614} // cos(newVec, existVec) = 0.75 + + result := Diff(insights, + "Dichorragia nesimachus first record in Bentong, Pahang.", + DiffOptions{ + NewEmbedding: newVec, + ExistingEmbed: []EmbeddedItem{{ID: "kinabalu", Embedding: existVec}}, + }) + if result.Suggestion != DiffAdd { + t.Errorf("cosine=0.75 (same domain, different location): want ADD, got %s", result.Suggestion) + } +} + func TestDiff_LimitDefault(t *testing.T) { insights := make([]*model.Insight, 20) for i := range insights { From 8dc68a97f112b3d8be07f84ece34902a0836d464 Mon Sep 17 00:00:00 2001 From: chancsc Date: Wed, 13 May 2026 19:12:42 +0800 Subject: [PATCH 2/4] fix: use Jaccard similarity in dedup to prevent same-domain false UPDATE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ContentSimilarity (bidirectional max) was too sensitive for formulaic scientific records: a Raub butterfly entry sharing the species name and standard phrasing with a Kinabalu entry produced tokenSim=0.5, crossing the UPDATE threshold and replacing the original. Jaccard (|A∩B|/|A∪B|) penalises texts that share domain vocabulary but have many distinct tokens (different facts). Same-domain different-location pairs now score ~0.28, falling below the 0.5 ADD threshold. Genuine one-word-change updates (SQLite→PostgreSQL) still score ~0.6 → UPDATE. ContentSimilarity is unchanged — bidirectional max remains correct for recall and keyword search. 
Co-Authored-By: Claude Sonnet 4.6 --- internal/search/diff.go | 4 ++-- internal/search/keyword.go | 23 +++++++++++++++++++++++ internal/search/keyword_test.go | 31 +++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/internal/search/diff.go b/internal/search/diff.go index 832111e..11b02d1 100644 --- a/internal/search/diff.go +++ b/internal/search/diff.go @@ -67,7 +67,7 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe // Step 2: score each candidate matches := make([]DiffMatch, 0, len(candidates)) for _, c := range candidates { - tokenSim := ContentSimilarity(newContent, c.Insight.Content) + tokenSim := JaccardSimilarity(newContent, c.Insight.Content) var cosineSim float64 if opts.NewEmbedding != nil { @@ -138,7 +138,7 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe if !ok { continue } - tokenSim := ContentSimilarity(newContent, ins.Content) + tokenSim := JaccardSimilarity(newContent, ins.Content) similarity := tokenSim if cp.sim >= 0.85 && cp.sim > similarity { similarity = cp.sim diff --git a/internal/search/keyword.go b/internal/search/keyword.go index e7c4344..26020b5 100644 --- a/internal/search/keyword.go +++ b/internal/search/keyword.go @@ -179,6 +179,29 @@ func flushCJK(buf []rune, tokens map[string]bool) { } } +// JaccardSimilarity computes token-set Jaccard similarity: |A∩B| / |A∪B|. +// Used for deduplication — stricter than ContentSimilarity because it penalises +// texts that share domain vocabulary but differ in the specific facts they state +// (e.g. same species name, different location). 
+func JaccardSimilarity(a, b string) float64 { + tokA := Tokenize(a) + tokB := Tokenize(b) + if len(tokA) == 0 || len(tokB) == 0 { + return 0 + } + intersection := 0 + for t := range tokA { + if tokB[t] { + intersection++ + } + } + union := len(tokA) + len(tokB) - intersection + if union == 0 { + return 0 + } + return float64(intersection) / float64(union) +} + // ContentSimilarity computes bidirectional token overlap between two texts. // Returns max(overlap_a_to_b, overlap_b_to_a) for a symmetric measure. func ContentSimilarity(a, b string) float64 { diff --git a/internal/search/keyword_test.go b/internal/search/keyword_test.go index 2d566ce..73d644f 100644 --- a/internal/search/keyword_test.go +++ b/internal/search/keyword_test.go @@ -124,6 +124,37 @@ func TestContentSimilarity_Empty(t *testing.T) { } } +func TestJaccardSimilarity_Identical(t *testing.T) { + if sim := JaccardSimilarity("Go uses SQLite", "Go uses SQLite"); sim != 1.0 { + t.Errorf("identical: want 1.0, got %f", sim) + } +} + +func TestJaccardSimilarity_Disjoint(t *testing.T) { + if sim := JaccardSimilarity("apple banana", "dog elephant"); sim != 0 { + t.Errorf("disjoint: want 0, got %f", sim) + } +} + +func TestJaccardSimilarity_SameDomainDifferentFact(t *testing.T) { + // Same species name (shared tokens) but different location (distinct tokens). + // Jaccard penalises the distinct tokens; bidirectional max would not. + a := "Dichorragia nesimachus singleton at Kinabalu Park Sabah lowland forest elevation" + b := "Dichorragia nesimachus first record Raub Pahang dipterocarp forest canopy specimen" + sim := JaccardSimilarity(a, b) + if sim >= 0.5 { + t.Errorf("same-domain different-fact: want Jaccard < 0.5 (ADD territory), got %f", sim) + } +} + +func TestJaccardSimilarity_OneWordChange(t *testing.T) { + // Same sentence with one word swapped — genuine update, Jaccard should be >= 0.5. 
+ sim := JaccardSimilarity("Go uses SQLite for storage", "Go uses PostgreSQL for storage") + if sim < 0.5 { + t.Errorf("one-word-change: want Jaccard >= 0.5 (UPDATE territory), got %f", sim) + } +} + func TestKeywordSearch_Ranking(t *testing.T) { insights := []*model.Insight{ {ID: "1", Content: "Go language for building CLI tools", Importance: 3}, From da43bfabbde30a35ea39efc90a0208cb10d1e6a4 Mon Sep 17 00:00:00 2001 From: chancsc Date: Wed, 13 May 2026 21:14:10 +0800 Subject: [PATCH 3/4] fix: remove 'not' from negation words, gate conflict check at similarity>=0.7 Two bugs caused CONFLICT false positives on butterfly survey data: 1. "not" in negationWords fires on virtually all scientific text ("species not previously recorded", "not endemic to region"). Removed, together with the bare Chinese negations "不" and "没有": only more specific state-change words and phrases (e.g. "no longer", "replaced", "不再") remain as signals. 2. Negation check fired at similarity>=0.5. At borderline similarity, texts share domain vocabulary without being about the same subject. Now only checked when similarity>=0.7. Also updates guide.md: PDF/external-document facts must use --no-diff since each document is a distinct authoritative source. Co-Authored-By: Claude Sonnet 4.6 --- internal/search/diff.go | 25 ++++++++++++++++--------- internal/search/diff_test.go | 22 +++++++++++++++++++++- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/internal/search/diff.go b/internal/search/diff.go index 11b02d1..9d33c4a 100644 --- a/internal/search/diff.go +++ b/internal/search/diff.go @@ -178,11 +178,13 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe } } -// negationWords detects potential contradictions. +// negationWords detects clear state-change signals. Single common words like +// "not" are intentionally excluded — they appear constantly in scientific and +// research text and cause false CONFLICT classifications. 
var negationWords = []string{ - "not", "no longer", "don't", "doesn't", "never", "switched from", + "no longer", "don't", "doesn't", "never", "switched from", "instead of", "rather than", "replaced", "deprecated", - "不", "没有", "不再", "放弃", "替换", "取消", + "不再", "放弃", "替换", "取消", } func classifySuggestion(similarity float64, newText, existingText string) DiffSuggestion { @@ -190,12 +192,17 @@ func classifySuggestion(similarity float64, newText, existingText string) DiffSu return DiffAdd } - // Check for negation/conflict signals (even at high similarity) - newLower := strings.ToLower(newText) - existLower := strings.ToLower(existingText) - for _, neg := range negationWords { - if strings.Contains(newLower, neg) || strings.Contains(existLower, neg) { - return DiffConflict + // Only check for conflict signals when texts are substantially similar. + // At borderline similarity (0.5–0.7) texts may share domain vocabulary + // without being about the same subject (e.g. two survey records from + // different locations with shared species names). + if similarity >= 0.7 { + newLower := strings.ToLower(newText) + existLower := strings.ToLower(existingText) + for _, neg := range negationWords { + if strings.Contains(newLower, neg) || strings.Contains(existLower, neg) { + return DiffConflict + } } } diff --git a/internal/search/diff_test.go b/internal/search/diff_test.go index c012a0c..b63dd72 100644 --- a/internal/search/diff_test.go +++ b/internal/search/diff_test.go @@ -33,13 +33,13 @@ func TestClassifySuggestion_ConflictNegation(t *testing.T) { newText string existing string }{ - {"not", "do not use Redis", "use Redis for caching"}, {"no longer", "no longer supports Python 2", "supports Python 2"}, {"replaced", "replaced Flask with FastAPI", "uses Flask for API"}, {"chinese_negation", "不再使用Redis", "使用Redis"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + // Similarity >= 0.7 so negation check is active. 
got := classifySuggestion(0.7, tt.newText, tt.existing) if got != DiffConflict { t.Errorf("want CONFLICT, got %s", got) @@ -48,6 +48,26 @@ func TestClassifySuggestion_ConflictNegation(t *testing.T) { } } +func TestClassifySuggestion_NotWordNoConflict(t *testing.T) { + // "not" alone must NOT trigger CONFLICT — it appears constantly in + // scientific text ("species not previously recorded") and would cause + // false replacements of distinct survey records. + got := classifySuggestion(0.7, "species not recorded at this site", "species recorded at Kinabalu") + if got == DiffConflict { + t.Error("bare 'not' should not trigger CONFLICT") + } +} + +func TestClassifySuggestion_ConflictBelowThreshold(t *testing.T) { + // Negation words must NOT trigger CONFLICT when similarity < 0.7. + // Two survey records from different locations may share domain vocabulary + // and contain "no longer" or "replaced" in unrelated sentences. + got := classifySuggestion(0.6, "no longer present at Raub site", "butterfly survey Kinabalu") + if got == DiffConflict { + t.Error("negation below similarity 0.7 should not trigger CONFLICT") + } +} + func TestClassifySuggestion_Boundary(t *testing.T) { // Exactly 0.5 should not be ADD (it's >= 0.5) got := classifySuggestion(0.5, "some content here", "other content here") From 7044690367790408b57e06d8eff1b6162df1504e Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 09:58:36 +0000 Subject: [PATCH 4/4] chore: update CHANGELOG for dedup false-positive fixes --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9b8697..33c3cab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/). 
## [Unreleased] +### Fixed +- Deduplication false positives on scientific and domain-specific text: + - Removed bare `"not"` (and the Chinese equivalents `"不"` and `"没有"`) from negation words — such common words appear in virtually all + scientific prose and caused unrelated records to be classified as CONFLICT. + - Gated negation-word check behind similarity ≥ 0.7 — at borderline + similarity, shared domain vocabulary is not a reliable conflict signal. + - Raised cosine dedup threshold from 0.70 to 0.85 — same-domain + different-fact pairs (e.g. survey records at different locations) produce + cosine ~0.75 with nomic-embed-text and were incorrectly triggering UPDATE. + - Switched token dedup from bidirectional-max (`ContentSimilarity`) to + Jaccard (`|A∩B|/|A∪B|`) — penalises texts that share vocabulary but + differ in most tokens, preventing formulaic records from scoring as UPDATE. + ## [0.1.4] - 2026-05-16 ### Added