diff --git a/CHANGELOG.md b/CHANGELOG.md index a9b8697..33c3cab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/). ## [Unreleased] +### Fixed +- Deduplication false positives on scientific and domain-specific text: + - Removed bare `"not"` from negation words — it appears in virtually all + scientific prose and caused unrelated records to be classified as CONFLICT. The bare Chinese negations `"不"` and `"没有"` were dropped from the list as well. + - Gated negation-word check behind similarity ≥ 0.7 — at borderline + similarity, shared domain vocabulary is not a reliable conflict signal. + - Raised cosine dedup threshold from 0.70 to 0.85 — same-domain + different-fact pairs (e.g. survey records at different locations) produce + cosine ~0.75 with nomic-embed-text and were incorrectly triggering UPDATE. + - Switched token dedup from bidirectional-max (`ContentSimilarity`) to + Jaccard (`|A∩B|/|A∪B|`) — penalises texts that share vocabulary but + differ in most tokens, preventing formulaic records from scoring as UPDATE. + ## [0.1.4] - 2026-05-16 ### Added diff --git a/internal/search/diff.go b/internal/search/diff.go index b897eec..9d33c4a 100644 --- a/internal/search/diff.go +++ b/internal/search/diff.go @@ -67,7 +67,7 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe // Step 2: score each candidate matches := make([]DiffMatch, 0, len(candidates)) for _, c := range candidates { - tokenSim := ContentSimilarity(newContent, c.Insight.Content) + tokenSim := JaccardSimilarity(newContent, c.Insight.Content) var cosineSim float64 if opts.NewEmbedding != nil { @@ -76,10 +76,11 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe } } - // Combined similarity: cosine only contributes when above 0.7 - // (below that, same-domain but different content can produce false matches) + // Combined similarity: cosine only contributes when at or above 0.85. + // Below that, same-domain content (e.g. 
two butterfly survey locations) + // clusters around 0.70–0.84 and produces false UPDATE matches. similarity := tokenSim - if cosineSim >= 0.7 && cosineSim > similarity { + if cosineSim >= 0.85 && cosineSim > similarity { similarity = cosineSim } @@ -137,9 +138,9 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe if !ok { continue } - tokenSim := ContentSimilarity(newContent, ins.Content) + tokenSim := JaccardSimilarity(newContent, ins.Content) similarity := tokenSim - if cp.sim >= 0.7 && cp.sim > similarity { + if cp.sim >= 0.85 && cp.sim > similarity { similarity = cp.sim } suggestion := classifySuggestion(similarity, newContent, ins.Content) @@ -177,11 +178,13 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe } } -// negationWords detects potential contradictions. +// negationWords detects clear state-change signals. Single common words like +// "not" are intentionally excluded — they appear constantly in scientific and +// research text and cause false CONFLICT classifications. The bare "不" and "没有" are omitted too. var negationWords = []string{ - "not", "no longer", "don't", "doesn't", "never", "switched from", + "no longer", "don't", "doesn't", "never", "switched from", "instead of", "rather than", "replaced", "deprecated", - "不", "没有", "不再", "放弃", "替换", "取消", + "不再", "放弃", "替换", "取消", } func classifySuggestion(similarity float64, newText, existingText string) DiffSuggestion { @@ -189,12 +192,17 @@ func classifySuggestion(similarity float64, newText, existingText string) DiffSu return DiffAdd } - // Check for negation/conflict signals (even at high similarity) - newLower := strings.ToLower(newText) - existLower := strings.ToLower(existingText) - for _, neg := range negationWords { - if strings.Contains(newLower, neg) || strings.Contains(existLower, neg) { - return DiffConflict + // Only check for conflict signals when texts are substantially similar. 
+ // At borderline similarity (0.5 ≤ similarity < 0.7) texts may share domain vocabulary + // without being about the same subject (e.g. two survey records from + // different locations with shared species names). + if similarity >= 0.7 { + newLower := strings.ToLower(newText) + existLower := strings.ToLower(existingText) + for _, neg := range negationWords { + if strings.Contains(newLower, neg) || strings.Contains(existLower, neg) { + return DiffConflict + } } } diff --git a/internal/search/diff_test.go b/internal/search/diff_test.go index 802e496..b63dd72 100644 --- a/internal/search/diff_test.go +++ b/internal/search/diff_test.go @@ -33,13 +33,13 @@ func TestClassifySuggestion_ConflictNegation(t *testing.T) { newText string existing string }{ - {"not", "do not use Redis", "use Redis for caching"}, {"no longer", "no longer supports Python 2", "supports Python 2"}, {"replaced", "replaced Flask with FastAPI", "uses Flask for API"}, {"chinese_negation", "不再使用Redis", "使用Redis"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + // Similarity >= 0.7 so negation check is active. got := classifySuggestion(0.7, tt.newText, tt.existing) if got != DiffConflict { t.Errorf("want CONFLICT, got %s", got) @@ -48,6 +48,26 @@ func TestClassifySuggestion_ConflictNegation(t *testing.T) { } } +func TestClassifySuggestion_NotWordNoConflict(t *testing.T) { + // "not" alone must NOT trigger CONFLICT — it appears constantly in + // scientific text ("species not previously recorded") and would cause + // false replacements of distinct survey records. + got := classifySuggestion(0.7, "species not recorded at this site", "species recorded at Kinabalu") + if got == DiffConflict { + t.Error("bare 'not' should not trigger CONFLICT") + } +} + +func TestClassifySuggestion_ConflictBelowThreshold(t *testing.T) { + // Negation words must NOT trigger CONFLICT when similarity < 0.7. 
+ // Two survey records from different locations may share domain vocabulary + // and contain "no longer" or "replaced" in unrelated sentences. + got := classifySuggestion(0.6, "no longer present at Raub site", "butterfly survey Kinabalu") + if got == DiffConflict { + t.Error("negation below similarity 0.7 should not trigger CONFLICT") + } +} + func TestClassifySuggestion_Boundary(t *testing.T) { // Exactly 0.5 should not be ADD (it's >= 0.5) got := classifySuggestion(0.5, "some content here", "other content here") @@ -105,6 +125,29 @@ func TestDiff_DuplicateOverridesOverall(t *testing.T) { } } +func TestDiff_SameDomainCosineNoOverride(t *testing.T) { + // Regression: same-domain facts with different locations must not trigger UPDATE. + // nomic-embed-text produces cosine ~0.75 for same-domain different-fact pairs. + // The old threshold (0.70) let cosine override token similarity and incorrectly + // classified as UPDATE, replacing the original insight. The fix raises it to 0.85. + insights := []*model.Insight{ + {ID: "kinabalu", Content: "Dichorragia nesimachus singleton at Kinabalu Park, Sabah."}, + } + // Two unit vectors with cosine similarity = 0.75: simulates same-domain different-fact embeddings. 
+ newVec := []float64{1.0, 0.0} + existVec := []float64{0.75, 0.6614} // cos(newVec, existVec) = 0.75 + + result := Diff(insights, + "Dichorragia nesimachus first record in Bentong, Pahang.", + DiffOptions{ + NewEmbedding: newVec, + ExistingEmbed: []EmbeddedItem{{ID: "kinabalu", Embedding: existVec}}, + }) + if result.Suggestion != DiffAdd { + t.Errorf("cosine=0.75 (same domain, different location): want ADD, got %s", result.Suggestion) + } +} + func TestDiff_LimitDefault(t *testing.T) { insights := make([]*model.Insight, 20) for i := range insights { diff --git a/internal/search/keyword.go b/internal/search/keyword.go index e7c4344..26020b5 100644 --- a/internal/search/keyword.go +++ b/internal/search/keyword.go @@ -179,6 +179,29 @@ func flushCJK(buf []rune, tokens map[string]bool) { } } +// JaccardSimilarity computes token-set Jaccard similarity: |A∩B| / |A∪B|. +// Used for deduplication — stricter than ContentSimilarity because it penalises +// texts that share domain vocabulary but differ in the specific facts they state +// (e.g. same species name, different location). +func JaccardSimilarity(a, b string) float64 { + tokA := Tokenize(a) + tokB := Tokenize(b) + if len(tokA) == 0 || len(tokB) == 0 { + return 0 + } + intersection := 0 + for t := range tokA { + if tokB[t] { + intersection++ + } + } + union := len(tokA) + len(tokB) - intersection + if union == 0 { + return 0 + } + return float64(intersection) / float64(union) +} + // ContentSimilarity computes bidirectional token overlap between two texts. // Returns max(overlap_a_to_b, overlap_b_to_a) for a symmetric measure. 
func ContentSimilarity(a, b string) float64 { diff --git a/internal/search/keyword_test.go b/internal/search/keyword_test.go index 2d566ce..73d644f 100644 --- a/internal/search/keyword_test.go +++ b/internal/search/keyword_test.go @@ -124,6 +124,37 @@ func TestContentSimilarity_Empty(t *testing.T) { } } +func TestJaccardSimilarity_Identical(t *testing.T) { + if sim := JaccardSimilarity("Go uses SQLite", "Go uses SQLite"); sim != 1.0 { + t.Errorf("identical: want 1.0, got %f", sim) + } +} + +func TestJaccardSimilarity_Disjoint(t *testing.T) { + if sim := JaccardSimilarity("apple banana", "dog elephant"); sim != 0 { + t.Errorf("disjoint: want 0, got %f", sim) + } +} + +func TestJaccardSimilarity_SameDomainDifferentFact(t *testing.T) { + // Same species name (shared tokens) but different location (distinct tokens). + // Jaccard penalises the distinct tokens; bidirectional max would not. + a := "Dichorragia nesimachus singleton at Kinabalu Park Sabah lowland forest elevation" + b := "Dichorragia nesimachus first record Raub Pahang dipterocarp forest canopy specimen" + sim := JaccardSimilarity(a, b) + if sim >= 0.5 { + t.Errorf("same-domain different-fact: want Jaccard < 0.5 (ADD territory), got %f", sim) + } +} + +func TestJaccardSimilarity_OneWordChange(t *testing.T) { + // Same sentence with one word swapped — genuine update, Jaccard should be >= 0.5. + sim := JaccardSimilarity("Go uses SQLite for storage", "Go uses PostgreSQL for storage") + if sim < 0.5 { + t.Errorf("one-word-change: want Jaccard >= 0.5 (UPDATE territory), got %f", sim) + } +} + func TestKeywordSearch_Ranking(t *testing.T) { insights := []*model.Insight{ {ID: "1", Content: "Go language for building CLI tools", Importance: 3},