Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/).

## [Unreleased]

### Fixed
- Deduplication false positives on scientific and domain-specific text:
  - Removed bare `"not"` from negation words — it appears in virtually all
    scientific prose and caused unrelated records to be classified as CONFLICT.
    The bare Chinese negations `"不"` and `"没有"` were removed as well.
  - Gated the negation-word check behind similarity ≥ 0.7 — at borderline
    similarity, shared domain vocabulary is not a reliable conflict signal.
  - Raised the cosine dedup threshold from 0.70 to 0.85 — same-domain
    different-fact pairs (e.g. survey records at different locations) produce
    cosine ~0.75 with nomic-embed-text and were incorrectly triggering UPDATE.
  - Switched token dedup from bidirectional-max (`ContentSimilarity`) to
    Jaccard (`|A∩B|/|A∪B|`) — penalises texts that share vocabulary but
    differ in most tokens, preventing formulaic records from scoring as UPDATE.

## [0.1.4] - 2026-05-16

### Added
Expand Down
38 changes: 23 additions & 15 deletions internal/search/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
// Step 2: score each candidate
matches := make([]DiffMatch, 0, len(candidates))
for _, c := range candidates {
tokenSim := ContentSimilarity(newContent, c.Insight.Content)
tokenSim := JaccardSimilarity(newContent, c.Insight.Content)

var cosineSim float64
if opts.NewEmbedding != nil {
Expand All @@ -76,10 +76,11 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
}
}

// Combined similarity: cosine only contributes when above 0.7
// (below that, same-domain but different content can produce false matches)
// Combined similarity: cosine only contributes when above 0.85.
// Below that, same-domain content (e.g. two butterfly survey locations)
// clusters around 0.70–0.84 and produces false UPDATE matches.
similarity := tokenSim
if cosineSim >= 0.7 && cosineSim > similarity {
if cosineSim >= 0.85 && cosineSim > similarity {
similarity = cosineSim
}

Expand Down Expand Up @@ -137,9 +138,9 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
if !ok {
continue
}
tokenSim := ContentSimilarity(newContent, ins.Content)
tokenSim := JaccardSimilarity(newContent, ins.Content)
similarity := tokenSim
if cp.sim >= 0.7 && cp.sim > similarity {
if cp.sim >= 0.85 && cp.sim > similarity {
similarity = cp.sim
}
suggestion := classifySuggestion(similarity, newContent, ins.Content)
Expand Down Expand Up @@ -177,24 +178,31 @@ func Diff(insights []*model.Insight, newContent string, opts DiffOptions) DiffRe
}
}

// negationWords detects potential contradictions.
// negationWords detects clear state-change signals. Single common words like
// "not" are intentionally excluded — they appear constantly in scientific and
// research text and cause false CONFLICT classifications.
var negationWords = []string{
"not", "no longer", "don't", "doesn't", "never", "switched from",
"no longer", "don't", "doesn't", "never", "switched from",
"instead of", "rather than", "replaced", "deprecated",
"不", "没有", "不再", "放弃", "替换", "取消",
"不再", "放弃", "替换", "取消",
}

func classifySuggestion(similarity float64, newText, existingText string) DiffSuggestion {
if similarity < 0.5 {
return DiffAdd
}

// Check for negation/conflict signals (even at high similarity)
newLower := strings.ToLower(newText)
existLower := strings.ToLower(existingText)
for _, neg := range negationWords {
if strings.Contains(newLower, neg) || strings.Contains(existLower, neg) {
return DiffConflict
// Only check for conflict signals when texts are substantially similar.
// At borderline similarity (0.5–0.7) texts may share domain vocabulary
// without being about the same subject (e.g. two survey records from
// different locations with shared species names).
if similarity >= 0.7 {
newLower := strings.ToLower(newText)
existLower := strings.ToLower(existingText)
for _, neg := range negationWords {
if strings.Contains(newLower, neg) || strings.Contains(existLower, neg) {
return DiffConflict
}
}
}

Expand Down
45 changes: 44 additions & 1 deletion internal/search/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ func TestClassifySuggestion_ConflictNegation(t *testing.T) {
newText string
existing string
}{
{"not", "do not use Redis", "use Redis for caching"},
{"no longer", "no longer supports Python 2", "supports Python 2"},
{"replaced", "replaced Flask with FastAPI", "uses Flask for API"},
{"chinese_negation", "不再使用Redis", "使用Redis"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Similarity >= 0.7 so negation check is active.
got := classifySuggestion(0.7, tt.newText, tt.existing)
if got != DiffConflict {
t.Errorf("want CONFLICT, got %s", got)
Expand All @@ -48,6 +48,26 @@ func TestClassifySuggestion_ConflictNegation(t *testing.T) {
}
}

// TestClassifySuggestion_NotWordNoConflict checks that a bare "not" in either
// text does not, on its own, make classifySuggestion return DiffConflict.
// The word is common in scientific prose ("species not previously recorded"),
// so treating it as a conflict signal would replace distinct survey records.
func TestClassifySuggestion_NotWordNoConflict(t *testing.T) {
	const (
		incoming = "species not recorded at this site"
		stored   = "species recorded at Kinabalu"
	)

	// 0.7 is the similarity at which the negation check is active, so the
	// result depends only on whether "not" is treated as a negation word.
	suggestion := classifySuggestion(0.7, incoming, stored)
	if suggestion == DiffConflict {
		t.Error("bare 'not' should not trigger CONFLICT")
	}
}

// TestClassifySuggestion_ConflictBelowThreshold checks that a negation phrase
// ("no longer") is ignored when the similarity passed in is below 0.7.
// Records from different locations can share domain vocabulary and still
// contain such phrases in unrelated sentences.
func TestClassifySuggestion_ConflictBelowThreshold(t *testing.T) {
	const (
		belowGate = 0.6
		incoming  = "no longer present at Raub site"
		stored    = "butterfly survey Kinabalu"
	)

	suggestion := classifySuggestion(belowGate, incoming, stored)
	if suggestion == DiffConflict {
		t.Error("negation below similarity 0.7 should not trigger CONFLICT")
	}
}

func TestClassifySuggestion_Boundary(t *testing.T) {
// Exactly 0.5 should not be ADD (it's >= 0.5)
got := classifySuggestion(0.5, "some content here", "other content here")
Expand Down Expand Up @@ -105,6 +125,29 @@ func TestDiff_DuplicateOverridesOverall(t *testing.T) {
}
}

// TestDiff_SameDomainCosineNoOverride is a regression test: two facts from the
// same domain but about different locations must be reported as ADD.
//
// nomic-embed-text yields a cosine of about 0.75 for such pairs. With the old
// 0.70 threshold that cosine overrode the token similarity and the pair was
// classified as UPDATE, replacing the original insight; the threshold is now
// 0.85, so a 0.75 cosine must no longer take over.
func TestDiff_SameDomainCosineNoOverride(t *testing.T) {
	const (
		storedID   = "kinabalu"
		storedText = "Dichorragia nesimachus singleton at Kinabalu Park, Sabah."
		newText    = "Dichorragia nesimachus first record in Bentong, Pahang."
	)

	// Two (approximately) unit-length vectors whose cosine is ≈ 0.75,
	// standing in for same-domain, different-fact embeddings.
	queryVec := []float64{1.0, 0.0}
	storedVec := []float64{0.75, 0.6614}

	stored := []*model.Insight{
		{ID: storedID, Content: storedText},
	}
	opts := DiffOptions{
		NewEmbedding:  queryVec,
		ExistingEmbed: []EmbeddedItem{{ID: storedID, Embedding: storedVec}},
	}

	result := Diff(stored, newText, opts)
	if result.Suggestion != DiffAdd {
		t.Errorf("cosine=0.75 (same domain, different location): want ADD, got %s", result.Suggestion)
	}
}

func TestDiff_LimitDefault(t *testing.T) {
insights := make([]*model.Insight, 20)
for i := range insights {
Expand Down
23 changes: 23 additions & 0 deletions internal/search/keyword.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,29 @@ func flushCJK(buf []rune, tokens map[string]bool) {
}
}

// JaccardSimilarity returns the Jaccard index of the token sets of a and b:
// |A∩B| / |A∪B|, where both sets are produced by Tokenize.
//
// It is the token measure used for deduplication. It is stricter than
// ContentSimilarity because dividing by the union lowers the score of texts
// that share domain vocabulary but state different facts (e.g. the same
// species name at a different location).
//
// The result is 0 when either text yields no tokens.
func JaccardSimilarity(a, b string) float64 {
	setA, setB := Tokenize(a), Tokenize(b)
	if len(setA) == 0 || len(setB) == 0 {
		return 0
	}

	// Count the tokens of a that are also present in b.
	shared := 0
	for tok := range setA {
		if setB[tok] {
			shared++
		}
	}

	// Inclusion–exclusion gives |A∪B|. Both sets are non-empty here and
	// shared cannot exceed either size, so the denominator is at least 1.
	total := len(setA) + len(setB) - shared
	return float64(shared) / float64(total)
}

// ContentSimilarity computes bidirectional token overlap between two texts.
// Returns max(overlap_a_to_b, overlap_b_to_a) for a symmetric measure.
func ContentSimilarity(a, b string) float64 {
Expand Down
31 changes: 31 additions & 0 deletions internal/search/keyword_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,37 @@ func TestContentSimilarity_Empty(t *testing.T) {
}
}

// TestJaccardSimilarity_Identical checks that a text compared with itself
// scores exactly 1.0 (intersection and union are the same set).
func TestJaccardSimilarity_Identical(t *testing.T) {
	const text = "Go uses SQLite"

	got := JaccardSimilarity(text, text)
	if got != 1.0 {
		t.Errorf("identical: want 1.0, got %f", got)
	}
}

// TestJaccardSimilarity_Disjoint checks that two texts with no word in common
// score exactly 0.
func TestJaccardSimilarity_Disjoint(t *testing.T) {
	const (
		left  = "apple banana"
		right = "dog elephant"
	)

	got := JaccardSimilarity(left, right)
	if got != 0 {
		t.Errorf("disjoint: want 0, got %f", got)
	}
}

// TestJaccardSimilarity_SameDomainDifferentFact checks that two records which
// share a species name but describe different locations score below 0.5.
// Jaccard penalises the tokens that differ; a bidirectional-max overlap
// would not.
func TestJaccardSimilarity_SameDomainDifferentFact(t *testing.T) {
	const (
		kinabalu = "Dichorragia nesimachus singleton at Kinabalu Park Sabah lowland forest elevation"
		raub     = "Dichorragia nesimachus first record Raub Pahang dipterocarp forest canopy specimen"
	)

	got := JaccardSimilarity(kinabalu, raub)
	if got >= 0.5 {
		t.Errorf("same-domain different-fact: want Jaccard < 0.5 (ADD territory), got %f", got)
	}
}

// TestJaccardSimilarity_OneWordChange checks that swapping a single word in an
// otherwise identical sentence — a genuine update — still scores at least 0.5.
func TestJaccardSimilarity_OneWordChange(t *testing.T) {
	const (
		before = "Go uses SQLite for storage"
		after  = "Go uses PostgreSQL for storage"
	)

	got := JaccardSimilarity(before, after)
	if got < 0.5 {
		t.Errorf("one-word-change: want Jaccard >= 0.5 (UPDATE territory), got %f", got)
	}
}

func TestKeywordSearch_Ranking(t *testing.T) {
insights := []*model.Insight{
{ID: "1", Content: "Go language for building CLI tools", Importance: 3},
Expand Down
Loading