mnemon-dev · Grivn · May 17, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
 
 ## [Unreleased]
 
+### Fixed
+- Deduplication false positives on scientific and domain-specific text:
+  - Removed bare `"not"` (and bare `"不"` / `"没有"`) from negation words — `"not"` appears in
+    virtually all scientific prose and caused unrelated records to be classified as CONFLICT.
+  - Gated negation-word check behind similarity ≥ 0.7 — at borderline
+    similarity, shared domain vocabulary is not a reliable conflict signal.
+  - Raised cosine dedup threshold from 0.70 to 0.85 — same-domain
+    different-fact pairs (e.g. survey records at different locations) produce
+    cosine ~0.75 with nomic-embed-text and were incorrectly triggering UPDATE.
+  - Switched token dedup from bidirectional-max (`ContentSimilarity`) to
+    Jaccard (`|A∩B|/|A∪B|`) — penalises texts that share vocabulary but
+    differ in most tokens, preventing formulaic records from scoring as UPDATE.
+
 ## [0.1.4] - 2026-05-16
 
 ### Added

diff --git a/internal/search/diff.go b/internal/search/diff.go
@@ -67,7 +67,7 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
 	// Step 2: score each candidate
 	matches := make([]DiffMatch, 0, len(candidates))
 	for _, c := range candidates {
-		tokenSim := ContentSimilarity(newContent, c.Insight.Content)
+		tokenSim := JaccardSimilarity(newContent, c.Insight.Content)
 
 		var cosineSim float64
 		if opts.NewEmbedding != nil {
@@ -76,10 +76,11 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
 			}
 		}
 
-		// Combined similarity: cosine only contributes when above 0.7
-		// (below that, same-domain but different content can produce false matches)
+		// Combined similarity: cosine only contributes when it is at least 0.85.
+		// Below that, same-domain content (e.g. two butterfly survey locations)
+		// clusters around 0.70–0.84 and produces false UPDATE matches.
 		similarity := tokenSim
-		if cosineSim >= 0.7 && cosineSim > similarity {
+		if cosineSim >= 0.85 && cosineSim > similarity {
 			similarity = cosineSim
 		}
 
@@ -137,9 +138,9 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
 			if !ok {
 				continue
 			}
-			tokenSim := ContentSimilarity(newContent, ins.Content)
+			tokenSim := JaccardSimilarity(newContent, ins.Content)
 			similarity := tokenSim
-			if cp.sim >= 0.7 && cp.sim > similarity {
+			if cp.sim >= 0.85 && cp.sim > similarity {
 				similarity = cp.sim
 			}
 			suggestion := classifySuggestion(similarity, newContent, ins.Content)
@@ -177,24 +178,31 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
 	}
 }
 
-// negationWords detects potential contradictions.
+// negationWords lists phrases that signal a clear state change. Bare negators
+// such as "not", "不" and "没有" are intentionally excluded — "not" in particular
+// appears constantly in scientific and research text, causing false CONFLICTs.
 var negationWords = []string{
-	"not", "no longer", "don't", "doesn't", "never", "switched from",
+	"no longer", "don't", "doesn't", "never", "switched from",
 	"instead of", "rather than", "replaced", "deprecated",
-	"不", "没有", "不再", "放弃", "替换", "取消",
+	"不再", "放弃", "替换", "取消",
 }
 
 func classifySuggestion(similarity float64, newText, existingText string) DiffSuggestion {
 	if similarity < 0.5 {
 		return DiffAdd
 	}
 
-	// Check for negation/conflict signals (even at high similarity)
-	newLower := strings.ToLower(newText)
-	existLower := strings.ToLower(existingText)
-	for _, neg := range negationWords {
-		if strings.Contains(newLower, neg) || strings.Contains(existLower, neg) {
-			return DiffConflict
+	// Only check for conflict signals when texts are substantially similar.
+	// At borderline similarity (0.5–0.7) texts may share domain vocabulary
+	// without being about the same subject (e.g. two survey records from
+	// different locations with shared species names).
+	if similarity >= 0.7 {
+		newLower := strings.ToLower(newText)
+		existLower := strings.ToLower(existingText)
+		for _, neg := range negationWords {
+			if strings.Contains(newLower, neg) || strings.Contains(existLower, neg) {
+				return DiffConflict
+			}
 		}
 	}
 

diff --git a/internal/search/diff_test.go b/internal/search/diff_test.go
@@ -33,13 +33,13 @@ func TestClassifySuggestion_ConflictNegation(t *testing.T) {
 		newText  string
 		existing string
 	}{
-		{"not", "do not use Redis", "use Redis for caching"},
 		{"no longer", "no longer supports Python 2", "supports Python 2"},
 		{"replaced", "replaced Flask with FastAPI", "uses Flask for API"},
 		{"chinese_negation", "不再使用Redis", "使用Redis"},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
+			// Similarity >= 0.7 so negation check is active.
 			got := classifySuggestion(0.7, tt.newText, tt.existing)
 			if got != DiffConflict {
 				t.Errorf("want CONFLICT, got %s", got)
@@ -48,6 +48,26 @@ func TestClassifySuggestion_ConflictNegation(t *testing.T) {
 	}
 }
 
+func TestClassifySuggestion_NotWordNoConflict(t *testing.T) {
+	// A bare "not" is too common in scientific prose ("species not previously
+	// recorded") to count as a negation signal, even at similarity 0.7 where
+	// the negation check runs; flagging it would replace distinct records.
+	newText, existing := "species not recorded at this site", "species recorded at Kinabalu"
+	if classifySuggestion(0.7, newText, existing) == DiffConflict {
+		t.Error("bare 'not' should not trigger CONFLICT")
+	}
+}
+
+func TestClassifySuggestion_ConflictBelowThreshold(t *testing.T) {
+	// Below similarity 0.7 the negation check is skipped, so a phrase such as
+	// "no longer" in one of two loosely related records (shared domain
+	// vocabulary, different subject) must not be reported as CONFLICT.
+	newText, existing := "no longer present at Raub site", "butterfly survey Kinabalu"
+	if classifySuggestion(0.6, newText, existing) == DiffConflict {
+		t.Error("negation below similarity 0.7 should not trigger CONFLICT")
+	}
+}
+
 func TestClassifySuggestion_Boundary(t *testing.T) {
 	// Exactly 0.5 should not be ADD (it's >= 0.5)
 	got := classifySuggestion(0.5, "some content here", "other content here")
@@ -105,6 +125,29 @@ func TestDiff_DuplicateOverridesOverall(t *testing.T) {
 	}
 }
 
+func TestDiff_SameDomainCosineNoOverride(t *testing.T) {
+	// Regression: two same-domain facts about different locations must be
+	// classified ADD, not UPDATE. The embeddings below have cosine ~0.75, the
+	// value reported for such pairs with nomic-embed-text. That is above the
+	// old 0.70 cut-off, where cosine overrode the token score and the original
+	// insight was replaced, but below the current 0.85 cut-off.
+	const existingID = "kinabalu"
+	insights := []*model.Insight{
+		{ID: existingID, Content: "Dichorragia nesimachus singleton at Kinabalu Park, Sabah."},
+	}
+
+	// (0.75, 0.6614) has length ~1, so its cosine with (1, 0) is ~0.75.
+	opts := DiffOptions{
+		NewEmbedding:  []float64{1.0, 0.0},
+		ExistingEmbed: []EmbeddedItem{{ID: existingID, Embedding: []float64{0.75, 0.6614}}},
+	}
+
+	result := Diff(insights, "Dichorragia nesimachus first record in Bentong, Pahang.", opts)
+	if result.Suggestion != DiffAdd {
+		t.Errorf("cosine=0.75 (same domain, different location): want ADD, got %s", result.Suggestion)
+	}
+}
+
 func TestDiff_LimitDefault(t *testing.T) {
 	insights := make([]*model.Insight, 20)
 	for i := range insights {

diff --git a/internal/search/keyword.go b/internal/search/keyword.go
@@ -179,6 +179,29 @@ func flushCJK(buf []rune, tokens map[string]bool) {
 	}
 }
 
+// JaccardSimilarity returns the Jaccard similarity of the token sets of a and b,
+// |A∩B| / |A∪B|, or 0 when either text yields no tokens. Used for deduplication;
+// stricter than ContentSimilarity because tokens unique to either text enlarge
+// the union, which penalises texts that share domain vocabulary but differ in
+// the specific facts they state (e.g. same species name, different location).
+func JaccardSimilarity(a, b string) float64 {
+	tokA := Tokenize(a)
+	tokB := Tokenize(b)
+	if len(tokA) == 0 || len(tokB) == 0 {
+		return 0
+	}
+	intersection := 0
+	for t := range tokA {
+		if tokB[t] {
+			intersection++
+		}
+	}
+	// Both sets are non-empty here, so the union holds at least one token and
+	// the division below cannot be by zero.
+	union := len(tokA) + len(tokB) - intersection
+	return float64(intersection) / float64(union)
+}
+
 // ContentSimilarity computes bidirectional token overlap between two texts.
 // Returns max(overlap_a_to_b, overlap_b_to_a) for a symmetric measure.
 func ContentSimilarity(a, b string) float64 {

diff --git a/internal/search/keyword_test.go b/internal/search/keyword_test.go
@@ -124,6 +124,37 @@ func TestContentSimilarity_Empty(t *testing.T) {
 	}
 }
 
+func TestJaccardSimilarity_Identical(t *testing.T) {
+	if got := JaccardSimilarity("Go uses SQLite", "Go uses SQLite"); got != 1.0 {
+		t.Errorf("identical: want 1.0, got %f", got)
+	}
+}
+
+func TestJaccardSimilarity_Disjoint(t *testing.T) {
+	if got := JaccardSimilarity("apple banana", "dog elephant"); got != 0 {
+		t.Errorf("disjoint: want 0, got %f", got)
+	}
+}
+
+func TestJaccardSimilarity_SameDomainDifferentFact(t *testing.T) {
+	// The two records share the species name but describe different sites, so
+	// most tokens are unique to one side and enlarge the union; the score must
+	// stay below 0.5, which the failure message labels ADD territory.
+	kinabalu := "Dichorragia nesimachus singleton at Kinabalu Park Sabah lowland forest elevation"
+	raub := "Dichorragia nesimachus first record Raub Pahang dipterocarp forest canopy specimen"
+	if got := JaccardSimilarity(kinabalu, raub); got >= 0.5 {
+		t.Errorf("same-domain different-fact: want Jaccard < 0.5 (ADD territory), got %f", got)
+	}
+}
+
+func TestJaccardSimilarity_OneWordChange(t *testing.T) {
+	// Only one word differs between the sentences, so the score must reach 0.5.
+	before, after := "Go uses SQLite for storage", "Go uses PostgreSQL for storage"
+	if got := JaccardSimilarity(before, after); got < 0.5 {
+		t.Errorf("one-word-change: want Jaccard >= 0.5 (UPDATE territory), got %f", got)
+	}
+}
+
 func TestKeywordSearch_Ranking(t *testing.T) {
 	insights := []*model.Insight{
 		{ID: "1", Content: "Go language for building CLI tools", Importance: 3},