/** Coarse script class of a single character, as used for boundary detection. */
type ScriptType = "kanji" | "hiragana" | "katakana" | "other";

/**
 * Classify one character (one Unicode code point) by script.
 *
 * Kanji is tested first on purpose: ヶ (U+30F6) is listed in the kanji class
 * but also lies inside the katakana range \u30A1-\u30F6. Testing katakana
 * first would make the explicit ヶ entry unreachable and would split names
 * such as 鳩ヶ谷 into kanji / katakana / kanji. This ordering matches the
 * `scriptOf` helper in scorer.ts, which also tests kanji first.
 */
function scriptOf(ch: string): ScriptType {
  if (/[\p{Script=Han}々〆ヶ]/u.test(ch)) return "kanji";
  if (/[\u3041-\u3096]/.test(ch)) return "hiragana";
  if (/[\u30A1-\u30F6\u30FC]/.test(ch)) return "katakana";
  return "other";
}

/** Position and direction of the single kanji/kana transition in a name. */
export interface ScriptBoundary {
  /** Index of the first character after the transition, counted in code points. */
  index: number;
  direction: "kanji-to-kana" | "kana-to-kanji";
}

/**
 * Find the split position where a single script boundary occurs
 * between kanji and kana (in either direction).
 *
 * Returns undefined when:
 *  - there is no transition at all,
 *  - there is more than one transition,
 *  - any transition touches a character that is neither kanji nor kana,
 *  - the only transition is between hiragana and katakana.
 *
 * Examples:
 *   "夏色まつり" → { index: 2, direction: "kanji-to-kana" }
 *   "白銀ノエル" → { index: 2, direction: "kanji-to-kana" }
 *   "デーモン閣下" → { index: 4, direction: "kana-to-kanji" }
 *   "もこ田めめめ" → undefined (2 transitions)
 *   "田中太郎" → undefined (all kanji, no transition)
 *
 * @param fullName - the unsplit name to inspect
 * @returns the unique boundary, or undefined if no unique kanji/kana boundary exists
 */
export function findSingleScriptBoundary(fullName: string): ScriptBoundary | undefined {
  const isKana = (s: ScriptType): boolean => s === "hiragana" || s === "katakana";

  // Spread iterates by code point, so a surrogate-pair kanji counts as one character.
  const scripts = [...fullName].map(scriptOf);

  let transition: { index: number; from: ScriptType; to: ScriptType } | undefined;
  let prev: ScriptType | undefined;

  for (const [i, curr] of scripts.entries()) {
    if (prev !== undefined && prev !== curr) {
      // A transition into or out of a non-Japanese character disqualifies the name.
      if (prev === "other" || curr === "other") return undefined;
      // A second transition means the boundary is not unique.
      if (transition !== undefined) return undefined;
      transition = { index: i, from: prev, to: curr };
    }
    prev = curr;
  }

  if (transition === undefined) return undefined;

  if (transition.from === "kanji" && isKana(transition.to)) {
    return { index: transition.index, direction: "kanji-to-kana" };
  }
  if (isKana(transition.from) && transition.to === "kanji") {
    return { index: transition.index, direction: "kana-to-kanji" };
  }

  // The single transition was hiragana <-> katakana: not a kanji/kana boundary.
  return undefined;
}

/**
 * Index of the unique kanji→kana boundary, or undefined when the name has
 * no unique boundary or the boundary runs kana→kanji.
 *
 * @deprecated Use findSingleScriptBoundary instead
 */
export function findSingleKanjiToKanaBoundary(fullName: string): number | undefined {
  const result = findSingleScriptBoundary(fullName);
  if (result?.direction === "kanji-to-kana") return result.index;
  return undefined;
}
/**
 * Pick the penalty for a run of kana script codes ("H" = hiragana,
 * "T" = katakana) that adjoins a kanji run.
 * A single kana character is penalized by its script; longer runs share one penalty.
 */
function kanaRunPenalty(kanaRun: string): number {
  if (kanaRun.length > 1) return SEI_MIXED_MULTI_KANA_PENALTY;
  return kanaRun === "T"
    ? SEI_MIXED_SINGLE_KATA_PENALTY
    : SEI_MIXED_SINGLE_HIRA_PENALTY;
}

/**
 * Penalty for OOV surnames that contain kana (e.g. 宝鐘マ, 星街すい).
 * Real Japanese surnames are almost always pure kanji.
 * Only applied when the surname has no dictionary hit.
 *
 * @param sei - the surname candidate string
 * @param seiMatch - dictionary match type of the surname candidate
 * @returns 0 unless `seiMatch` is "none" and the script pattern is kanji
 *   followed by a kana run; otherwise a negative penalty
 */
function seiMixedScriptPenalty(sei: string, seiMatch: MatchType): number {
  if (seiMatch !== "none") return 0;

  // The capture group yields the trailing kana run without a non-null assertion.
  const kanaSuffix = /^K+([HT]+)$/.exec(scriptPattern(sei))?.[1];
  return kanaSuffix === undefined ? 0 : kanaRunPenalty(kanaSuffix);
}

/**
 * Penalty for OOV given names that start with kana followed by kanji (e.g. モン閣下, イク眞木).
 * When a kana→kanji boundary exists, the mei side should be pure kanji.
 * Only applied when the given name has no dictionary hit.
 *
 * NOTE(review): no caller is visible in this diff, and calcScore's comment
 * says the mei side is intentionally not penalized — confirm whether this
 * helper is still needed.
 *
 * @param mei - the given-name candidate string
 * @param meiMatch - dictionary match type of the given-name candidate
 * @returns 0 unless `meiMatch` is "none" and the script pattern is a kana
 *   run followed by kanji; otherwise a negative penalty
 */
function meiMixedScriptPenalty(mei: string, meiMatch: MatchType): number {
  if (meiMatch !== "none") return 0;

  const kanaPrefix = /^([HT]+)K+$/.exec(scriptPattern(mei))?.[1];
  return kanaPrefix === undefined ? 0 : kanaRunPenalty(kanaPrefix);
}
sei length) + * @param boundaryIndex - the unique kanji→kana boundary position, or undefined if none */ export function calcScore( + sei: string, + mei: string, seiMatch: MatchType, meiMatch: MatchType, seiLen: number, meiLen: number, + splitIndex: number, + boundaryIndex: number | undefined, ): number { let score = 0; @@ -100,6 +178,12 @@ export function calcScore( score += MATCH_SCORE[seiMatch]; score += MATCH_SCORE[meiMatch]; + // Surname hit bonus: surnames are a known finite set, + // so a dictionary hit on sei is stronger evidence than on mei + if (seiMatch === "surface" || seiMatch === "folded") { + score += SEI_HIT_BONUS; + } + // Length scores (secondary signal) score += SEI_LENGTH_SCORE[Math.min(seiLen, 6)] ?? -0.5; score += MEI_LENGTH_SCORE[Math.min(meiLen, 6)] ?? -0.5; @@ -124,5 +208,23 @@ export function calcScore( score = -Infinity; } + // Script boundary scoring + if (boundaryIndex !== undefined) { + if (splitIndex === boundaryIndex) { + score += BOUNDARY_MATCH_BONUS; + if (seiMatch === "surface" || seiMatch === "folded") { + score += BOUNDARY_MATCH_WITH_DICT_BONUS; + } + } else if (splitIndex < boundaryIndex) { + score += BOUNDARY_BEFORE_PENALTY; + } else { + score += BOUNDARY_AFTER_PENALTY; + } + } + + // OOV surname mixed-script penalty (mei side is not penalized — + // names like よね子, ルミ子, 美つ子 naturally mix scripts) + score += seiMixedScriptPenalty(sei, seiMatch); + return score; } diff --git a/src/core/splitter.ts b/src/core/splitter.ts index a4bd19b..aa615bd 100644 --- a/src/core/splitter.ts +++ b/src/core/splitter.ts @@ -6,12 +6,17 @@ import type { SeimeiResult, SplitOptions, } from "./types.js"; -import { isAllHiragana, isAllKatakana, isNonJapanese } from "./normalize.js"; +import { isAllHiragana, isAllKatakana, isNonJapanese, findSingleScriptBoundary } from "./normalize.js"; import { calcScore, lookupMatch } from "./scorer.js"; const CONFIDENCE_THRESHOLD = 6.0; const CONFIDENCE_GAP = 1.0; +// Boundary confidence: when the best candidate 
aligns with a script boundary +// and has dictionary evidence, grant confidence 0.8 +const BOUNDARY_CONFIDENCE = 0.8; +const BOUNDARY_CONFIDENCE_GAP = 0.5; + let defaultLexicon: PackedLexicon | undefined; let defaultReading: ReadingData | undefined; @@ -73,7 +78,6 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const lexicon = options?.lexicon ?? defaultLexicon; if (!lexicon) { - // No lexicon loaded: return unsplit return { best: { sei: trimmed, mei: "" }, confidence: 0, @@ -94,6 +98,8 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const isKana = isAllHiragana(trimmed) || isAllKatakana(trimmed); const maxSplit = Math.min(lexicon.maxSeiLen, n - 1); + const boundary = findSingleScriptBoundary(trimmed); + const boundaryIndex = boundary?.index; const candidates: SeimeiCandidate[] = []; for (let i = 1; i <= maxSplit; i++) { @@ -107,7 +113,7 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const readingData = options?.readingData ?? defaultReading; const seiMatch = lookupMatch(sei, "sei", lexicon, isKana, readingData); const meiMatch = lookupMatch(mei, "mei", lexicon, isKana, readingData); - const score = calcScore(seiMatch, meiMatch, seiLen, meiLen); + const score = calcScore(sei, mei, seiMatch, meiMatch, seiLen, meiLen, i, boundaryIndex); candidates.push({ sei, mei, score, seiMatch, meiMatch }); } @@ -129,15 +135,67 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const confident = best.score >= CONFIDENCE_THRESHOLD && gap >= CONFIDENCE_GAP; - if (confident || options?.allowLowConfidence) { + // 1. Normal confidence: dictionary-based high score + if (confident) { + return { + best: { sei: best.sei, mei: best.mei }, + confidence: 1.0, + candidates, + }; + } + + // 2. 
Boundary confidence: best candidate aligns with script boundary + // and has dictionary evidence on the appropriate side + if (boundaryIndex !== undefined && [...best.sei].length === boundaryIndex) { + const hasDictEvidence = boundary?.direction === "kanji-to-kana" + ? (best.seiMatch === "surface" || best.seiMatch === "folded") + : (best.meiMatch === "surface" || best.meiMatch === "folded"); + + if ( + hasDictEvidence && + best.score >= CONFIDENCE_THRESHOLD && + gap >= BOUNDARY_CONFIDENCE_GAP + ) { + return { + best: { sei: best.sei, mei: best.mei }, + confidence: BOUNDARY_CONFIDENCE, + candidates, + }; + } + } + + // 3. Katakana-sei exception: when the best candidate's sei is all katakana + // (e.g. ジャガー/横田), real Japanese surnames are never pure katakana. + // Re-score the boundary candidate by looking up mei in sei dict instead. + if (boundary?.direction === "kana-to-kanji" && boundaryIndex !== undefined) { + const boundaryCandidate = candidates.find( + (c) => [...c.sei].length === boundaryIndex && isAllKatakana(c.sei) + ); + if (boundaryCandidate) { + const meiAsSei = lookupMatch( + boundaryCandidate.mei, "sei", lexicon, false, + options?.readingData ?? defaultReading, + ); + if (meiAsSei === "surface" || meiAsSei === "folded") { + return { + best: { sei: boundaryCandidate.sei, mei: boundaryCandidate.mei }, + confidence: BOUNDARY_CONFIDENCE, + candidates, + }; + } + } + } + + // 4. Low confidence mode + if (options?.allowLowConfidence) { return { best: { sei: best.sei, mei: best.mei }, - confidence: confident ? 1.0 : best.score / CONFIDENCE_THRESHOLD, + confidence: best.score / CONFIDENCE_THRESHOLD, candidates, }; } - // Not confident enough: return unsplit + // 4. 
Not confident enough: return unsplit return { best: { sei: trimmed, mei: "" }, confidence: 0, diff --git a/test/unit/split.test.ts b/test/unit/split.test.ts index 67ea94b..5fa1daa 100644 --- a/test/unit/split.test.ts +++ b/test/unit/split.test.ts @@ -1,16 +1,16 @@ import { describe, it, expect, beforeAll } from "vitest"; import { split, analyze, setLexicon } from "../../src/core/splitter"; +import { findSingleScriptBoundary } from "../../src/core/normalize"; import type { PackedLexicon } from "../../src/core/types"; -// Minimal test lexicon const testLexicon: PackedLexicon = { - sei: ["田中", "佐藤", "大瀬良", "林", "勅使河原", "小鳥遊", "西園寺", "齋藤"], - mei: ["太郎", "花子", "大地", "健太", "公望", "翔", "一郎"], + sei: ["田中", "佐藤", "大瀬良", "林", "勅使河原", "小鳥遊", "西園寺", "齋藤", "綾瀬", "夏", "周防", "横田", "池田", "秋山", "松村", "高峰"], + mei: ["太郎", "花子", "大地", "健太", "公望", "翔", "一郎", "リン", "田"], folded: { "斎藤": ["齋藤"], }, maxSeiLen: 4, - maxMeiLen: 3, + maxMeiLen: 5, }; describe("split", () => { @@ -79,6 +79,125 @@ describe("split", () => { }); }); + describe("findSingleScriptBoundary", () => { + it("漢字→ひらがな境界を検出する", () => { + expect(findSingleScriptBoundary("夏色まつり")).toEqual({ index: 2, direction: "kanji-to-kana" }); + }); + + it("漢字→カタカナ境界を検出する", () => { + expect(findSingleScriptBoundary("白銀ノエル")).toEqual({ index: 2, direction: "kanji-to-kana" }); + }); + + it("カタカナ→漢字境界を検出する", () => { + expect(findSingleScriptBoundary("ジャガー横田")).toEqual({ index: 4, direction: "kana-to-kanji" }); + }); + + it("ひらがな→漢字境界を検出する", () => { + expect(findSingleScriptBoundary("かたせ梨乃")).toEqual({ index: 3, direction: "kana-to-kanji" }); + }); + + it("全漢字は undefined", () => { + expect(findSingleScriptBoundary("田中太郎")).toBeUndefined(); + }); + + it("遷移2回は undefined", () => { + expect(findSingleScriptBoundary("もこ田めめめ")).toBeUndefined(); + }); + + it("全ひらがなは undefined", () => { + expect(findSingleScriptBoundary("たなかたろう")).toBeUndefined(); + }); + }); + + describe("文字種境界スコアリング", () => { + it("辞書ヒット姓 + かな名を境界で分離する(綾瀬はるか)", () => { + const 
result = analyze("綾瀬はるか"); + expect(result.best).toEqual({ sei: "綾瀬", mei: "はるか" }); + expect(result.confidence).toBeGreaterThanOrEqual(0.8); + }); + + it("辞書ヒット姓 + カタカナ名を境界で分離する(周防パトラ)", () => { + const result = analyze("周防パトラ"); + expect(result.best).toEqual({ sei: "周防", mei: "パトラ" }); + expect(result.confidence).toBeGreaterThanOrEqual(0.8); + }); + + it("allowLowConfidence: 辞書未登録でも境界位置が最高スコアになる", () => { + const result = analyze("夏色まつり", { allowLowConfidence: true }); + expect(result.best).toEqual({ sei: "夏色", mei: "まつり" }); + }); + + it("allowLowConfidence: 漢字→カタカナ境界が勝つ", () => { + const result = analyze("白銀ノエル", { allowLowConfidence: true }); + expect(result.best).toEqual({ sei: "白銀", mei: "ノエル" }); + }); + + it("辞書根拠がない場合は通常モードで unsplit", () => { + expect(split("東京はなこ")).toEqual({ sei: "東京はなこ", mei: "" }); + }); + + it("文字種遷移が2回以上ある場合は境界ボーナスなし", () => { + expect(split("夢野あき子")).toEqual({ sei: "夢野あき子", mei: "" }); + }); + + it("全漢字名は境界スコアの影響を受けない", () => { + const result = analyze("田中太郎"); + expect(result.best).toEqual({ sei: "田中", mei: "太郎" }); + expect(result.confidence).toBe(1.0); + }); + + it("1文字姓の正当な辞書ヒットは境界がなければ維持される", () => { + expect(split("林一郎")).toEqual({ sei: "林", mei: "一郎" }); + }); + }); + + describe("OOV姓の混在ペナルティ", () => { + it("漢字+カタカナ1文字の姓は大きく減点される", () => { + // 宝鐘マ/リン より 宝鐘/マリン が勝つべき + const result = analyze("宝鐘マリン", { allowLowConfidence: true }); + expect(result.best).toEqual({ sei: "宝鐘", mei: "マリン" }); + }); + + it("漢字+ひらがなの姓も減点される", () => { + const result = analyze("星街すいせい", { allowLowConfidence: true }); + expect(result.best).toEqual({ sei: "星街", mei: "すいせい" }); + }); + + it("辞書ヒットする姓には混在ペナルティが適用されない", () => { + // 綾瀬 is in dict — no penalty + const result = analyze("綾瀬はるか"); + expect(result.best).toEqual({ sei: "綾瀬", mei: "はるか" }); + }); + }); + + describe("姓ヒットボーナス", () => { + it("姓辞書ヒットは名辞書ヒットより優先される", () => { + // 松村/沙友理 (sei=surface) vs 松村沙/友理 (mei=surface) + const result = analyze("松村沙友理", { allowLowConfidence: true }); + 
expect(result.best.sei).toBe("松村"); + }); + }); + + describe("カタカナ姓の例外フロー", () => { + it("全カタカナ姓 + 漢字名は後半を姓辞書で照合する", () => { + // ジャガー/横田: 横田が姓辞書にヒット → 例外フローで採用 + const result = analyze("ジャガー横田"); + expect(result.best).toEqual({ sei: "ジャガー", mei: "横田" }); + expect(result.confidence).toBe(0.8); + }); + + it("ダン/池田も例外フローで分離する", () => { + const result = analyze("ダン池田"); + expect(result.best).toEqual({ sei: "ダン", mei: "池田" }); + expect(result.confidence).toBe(0.8); + }); + + it("漢字姓+漢字名には例外フローが適用されない", () => { + const result = analyze("田中太郎"); + expect(result.confidence).toBe(1.0); + }); + }); + describe("analyze", () => { it("候補リストとconfidenceを返す", () => { const result = analyze("田中太郎");