From f20ab827408a4ee034cbafa4886198d97fe4e3af Mon Sep 17 00:00:00 2001 From: tk1024 Date: Sun, 22 Mar 2026 20:36:51 +0900 Subject: [PATCH 1/5] =?UTF-8?q?feat:=20=E6=BC=A2=E5=AD=97=E2=86=92?= =?UTF-8?q?=E3=81=8B=E3=81=AA=E5=A2=83=E7=95=8C=E3=83=92=E3=83=A5=E3=83=BC?= =?UTF-8?q?=E3=83=AA=E3=82=B9=E3=83=86=E3=82=A3=E3=83=83=E3=82=AF=E3=81=AB?= =?UTF-8?q?=E3=82=88=E3=82=8B=E4=BD=8E=E4=BF=A1=E9=A0=BC=E6=99=82=E3=81=AE?= =?UTF-8?q?=E5=88=86=E9=9B=A2=E6=94=B9=E5=96=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 文字種境界(漢字→ひらがな/カタカナ)で分割される候補を、通常スコアが 閾値未満のときにフォールバックで救済する仕組みを追加。 - findSingleKanjiToKanaBoundary(): 遷移1回の漢字→かな境界を検出 - tryBoundaryFallback(): 姓側に辞書根拠がある場合のみ rescue - confidence 0.8 で返す(通常の 1.0 と区別) - 既存の辞書高信頼ケースには影響なし(MVP 94.7% 維持) Closes #6 Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/normalize.ts | 45 +++++++++++++++++++++++ src/core/splitter.ts | 81 +++++++++++++++++++++++++++++++++++++++-- test/unit/split.test.ts | 35 +++++++++++++++++- 3 files changed, 156 insertions(+), 5 deletions(-) diff --git a/src/core/normalize.ts b/src/core/normalize.ts index 46b20c4..7d202f0 100644 --- a/src/core/normalize.ts +++ b/src/core/normalize.ts @@ -40,6 +40,51 @@ const VARIANT_MAP: Record = { "條": "条", "圓": "円", }; +type ScriptType = "kanji" | "hiragana" | "katakana" | "other"; + +function scriptOf(ch: string): ScriptType { + if (/[\u3041-\u3096]/.test(ch)) return "hiragana"; + if (/[\u30A1-\u30F6\u30FC]/.test(ch)) return "katakana"; + if (/[\p{Script=Han}々〆ヶ]/u.test(ch)) return "kanji"; + return "other"; +} + +/** + * Find the split position where a single kanji→kana boundary occurs. + * Returns the character index (not byte index) of the boundary, + * or undefined if no unique boundary exists. + * + * Examples: + * "夏色まつり" → 2 (漢字→ひらがな at index 2) + * "白銀ノエル" → 2 (漢字→カタカナ at index 2) + * "もこ田めめめ" → undefined (2 transitions) + * "田中太郎" → undefined (all kanji, no transition) + */ +export function findSingleKanjiToKanaBoundary(fullName: string): number | undefined { + const chars = [...fullName]; + let transitionCount = 0; + let splitIndex: number | undefined; + let fromScript: ScriptType | undefined; + let toScript: ScriptType | undefined; + + for (let i = 1; i < chars.length; i++) { + const prev = scriptOf(chars[i - 1]); + const next = scriptOf(chars[i]); + if (prev === next) continue; + if (prev === "other" || next === "other") return undefined; + transitionCount++; + if (transitionCount > 1) return undefined; + splitIndex = i; + fromScript = prev; + toScript = next; + } + + if (transitionCount !== 1) return undefined; + if (fromScript !== "kanji") return undefined; + if (toScript !== "hiragana" && toScript !== "katakana") return undefined; + return splitIndex; +} + /** * Fold variant kanji to their canonical forms. */ diff --git a/src/core/splitter.ts b/src/core/splitter.ts index a4bd19b..22404dc 100644 --- a/src/core/splitter.ts +++ b/src/core/splitter.ts @@ -6,12 +6,18 @@ import type { SeimeiResult, SplitOptions, } from "./types.js"; -import { isAllHiragana, isAllKatakana, isNonJapanese } from "./normalize.js"; +import { isAllHiragana, isAllKatakana, isNonJapanese, findSingleKanjiToKanaBoundary } from "./normalize.js"; import { calcScore, lookupMatch } from "./scorer.js"; const CONFIDENCE_THRESHOLD = 6.0; const CONFIDENCE_GAP = 1.0; +// Boundary heuristic: rescue candidates at kanji→kana boundaries +// when normal scoring falls below threshold +const BOUNDARY_RESCUE_BONUS = 3.0; +const BOUNDARY_RESCUE_MIN_GAP = 0.5; +const BOUNDARY_RESCUE_CONFIDENCE = 0.8; + let defaultLexicon: PackedLexicon | undefined; let defaultReading: ReadingData | undefined; @@ -129,18 +135,85 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const confident = best.score >= CONFIDENCE_THRESHOLD && gap >= CONFIDENCE_GAP; - if (confident || options?.allowLowConfidence) { + // 1. Normal confidence: dictionary-based high score + if (confident) { + return { + best: { sei: best.sei, mei: best.mei }, + confidence: 1.0, + candidates, + }; + } + + // 2. Boundary fallback: rescue at kanji→kana boundary + const fallbackConfidence = tryBoundaryFallback(trimmed, candidates); + if (fallbackConfidence !== undefined) { + // Find the boundary candidate (may differ from score-based best) + const boundaryIndex = findSingleKanjiToKanaBoundary(trimmed); + const boundaryCandidate = candidates.find( + (c) => [...c.sei].length === boundaryIndex + ); + if (boundaryCandidate) { + return { + best: { sei: boundaryCandidate.sei, mei: boundaryCandidate.mei }, + confidence: fallbackConfidence, + candidates, + }; + } + } + + // 3. Low confidence mode + if (options?.allowLowConfidence) { return { best: { sei: best.sei, mei: best.mei }, - confidence: confident ? 1.0 : best.score / CONFIDENCE_THRESHOLD, + confidence: best.score / CONFIDENCE_THRESHOLD, candidates, }; } - // Not confident enough: return unsplit + // 4. Not confident enough: return unsplit return { best: { sei: trimmed, mei: "" }, confidence: 0, candidates, }; } + +/** + * Try to rescue a split using kanji→kana script boundary heuristic. + * Only applies when: + * - There is exactly one script transition (kanji → hiragana/katakana) + * - The boundary candidate has dictionary evidence on the surname side + * - The rescue score meets the confidence threshold + */ +function tryBoundaryFallback( + fullName: string, + candidates: SeimeiCandidate[], +): number | undefined { + const boundaryIndex = findSingleKanjiToKanaBoundary(fullName); + if (boundaryIndex === undefined) return undefined; + + const boundaryCandidate = candidates.find( + (c) => [...c.sei].length === boundaryIndex + ); + if (!boundaryCandidate) return undefined; + + // Require dictionary evidence on surname side + if ( + boundaryCandidate.seiMatch === "none" || + boundaryCandidate.seiMatch === "reading" + ) { + return undefined; + } + + const rescueScore = boundaryCandidate.score + BOUNDARY_RESCUE_BONUS; + if (rescueScore < CONFIDENCE_THRESHOLD) return undefined; + + // Check gap against other candidates' rescue scores + const otherBest = candidates + .filter((c) => [...c.sei].length !== boundaryIndex) + .reduce((max, c) => Math.max(max, c.score), -Infinity); + const rescueGap = rescueScore - otherBest; + if (rescueGap < BOUNDARY_RESCUE_MIN_GAP) return undefined; + + return BOUNDARY_RESCUE_CONFIDENCE; +} diff --git a/test/unit/split.test.ts b/test/unit/split.test.ts index 67ea94b..817d7a6 100644 --- a/test/unit/split.test.ts +++ b/test/unit/split.test.ts @@ -4,7 +4,7 @@ import type { PackedLexicon } from "../../src/core/types"; // Minimal test lexicon const testLexicon: PackedLexicon = { - sei: ["田中", "佐藤", "大瀬良", "林", "勅使河原", "小鳥遊", "西園寺", "齋藤"], + sei: ["田中", "佐藤", "大瀬良", "林", "勅使河原", "小鳥遊", "西園寺", "齋藤", "綾瀬", "白銀", "夏色"], mei: ["太郎", "花子", "大地", "健太", "公望", "翔", "一郎"], folded: { "斎藤": ["齋藤"], @@ -79,6 +79,39 @@ describe("split", () => { }); }); + describe("境界ヒューリスティック", () => { + it("漢字姓 + ひらがな名を境界フォールバックで救済する", () => { + const result = analyze("綾瀬はるか"); + expect(result.best).toEqual({ sei: "綾瀬", mei: "はるか" }); + expect(result.confidence).toBe(0.8); + }); + + it("漢字姓 + カタカナ名を救済する", () => { + const result = analyze("白銀ノエル"); + expect(result.best).toEqual({ sei: "白銀", mei: "ノエル" }); + expect(result.confidence).toBe(0.8); + }); + + it("漢字姓(辞書ヒット) + ひらがな名を救済する", () => { + expect(split("夏色まつり")).toEqual({ sei: "夏色", mei: "まつり" }); + }); + + it("姓側に辞書根拠がない場合は救済しない", () => { + expect(split("東京はなこ")).toEqual({ sei: "東京はなこ", mei: "" }); + }); + + it("文字種遷移が2回以上ある場合は境界救済しない", () => { + // "夢野あき子" — 遷移2回(漢字→ひらがな→漢字)、辞書ヒットもなし + expect(split("夢野あき子")).toEqual({ sei: "夢野あき子", mei: "" }); + }); + + it("通常の辞書高信頼ケースは従来どおり confidence 1.0", () => { + const result = analyze("田中太郎"); + expect(result.best).toEqual({ sei: "田中", mei: "太郎" }); + expect(result.confidence).toBe(1.0); + }); + }); + describe("analyze", () => { it("候補リストとconfidenceを返す", () => { const result = analyze("田中太郎"); From b673e3a8d2ff55f7483f06be4e195bf4bd206767 Mon Sep 17 00:00:00 2001 From: tk1024 Date: Sun, 22 Mar 2026 22:01:52 +0900 Subject: [PATCH 2/5] =?UTF-8?q?feat:=20OOV=E5=A7=93=E3=81=AE=E6=96=87?= =?UTF-8?q?=E5=AD=97=E7=A8=AE=E6=B7=B7=E5=9C=A8=E3=83=9A=E3=83=8A=E3=83=AB?= =?UTF-8?q?=E3=83=86=E3=82=A3=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 辞書未ヒットの姓にひらがな/カタカナが含まれる場合に減点する。 日本人の苗字はほぼ漢字のみで構成されるため、 「宝鐘マ」のような漢字+かな混在の姓は不自然と判定できる。 - 漢字+カタカナ1文字末尾: -3.0 - 漢字+ひらがな1文字末尾: -2.5 - 漢字+かな2文字以上末尾: -1.5 - 辞書ヒットする姓には適用しない - BOUNDARY_AFTER_PENALTY を -1.2 → -1.8 に強化 VTuber名 [lowConf]: 3.3% → 93.3%(28/30正解) 既存精度への影響なし(MVP 94.7%、誤分割0%) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/scorer.ts | 70 +++++++++++++++++++++++++++++++++++ src/core/splitter.ts | 82 ++++++++++------------------------------- test/unit/split.test.ts | 41 ++++++++++++++------- 3 files changed, 117 insertions(+), 76 deletions(-) diff --git a/src/core/scorer.ts b/src/core/scorer.ts index e0ca84f..4177607 100644 --- a/src/core/scorer.ts +++ b/src/core/scorer.ts @@ -30,6 +30,17 @@ const MEI_LENGTH_SCORE: Record = { const PAIR_BONUS = 0.8; const BOTH_SINGLE_CHAR_PENALTY = -1.0; +// Script boundary scoring +const BOUNDARY_MATCH_BONUS = 1.2; +const BOUNDARY_MATCH_WITH_DICT_BONUS = 0.8; +const BOUNDARY_BEFORE_PENALTY = -3.0; +const BOUNDARY_AFTER_PENALTY = -1.8; + +// Sei mixed-script penalty: OOV surname containing kana is unnatural +const SEI_MIXED_SINGLE_HIRA_PENALTY = -2.5; +const SEI_MIXED_SINGLE_KATA_PENALTY = -3.0; +const SEI_MIXED_MULTI_KANA_PENALTY = -1.5; + // Cache for Set-based lookups built from string[] const setCache = new WeakMap; mei: Set }>(); @@ -45,6 +56,41 @@ function getSets(lexicon: PackedLexicon): { sei: Set; mei: Set } return cached; } +const RE_KANJI = /[\p{Script=Han}々〆ヶ]/u; +const RE_HIRAGANA = /[\u3041-\u3096]/; +const RE_KATAKANA = /[\u30A1-\u30F6\u30FC]/; + +function scriptOf(ch: string): "K" | "H" | "T" | "O" { + if (RE_KANJI.test(ch)) return "K"; + if (RE_HIRAGANA.test(ch)) return "H"; + if (RE_KATAKANA.test(ch)) return "T"; + return "O"; +} + +function scriptPattern(s: string): string { + return [...s].map(scriptOf).join(""); +} + +/** + * Penalty for OOV surnames that contain kana (e.g. 宝鐘マ, 星街すい). + * Real Japanese surnames are almost always pure kanji. + * Only applied when the surname has no dictionary hit. + */ +function seiMixedScriptPenalty(sei: string, seiMatch: MatchType): number { + if (seiMatch !== "none") return 0; + + const p = scriptPattern(sei); + if (!/^K+[HT]+$/.test(p)) return 0; + + const suffix = p.match(/[HT]+$/)![0]; + if (suffix.length === 1) { + return suffix[0] === "T" + ? SEI_MIXED_SINGLE_KATA_PENALTY + : SEI_MIXED_SINGLE_HIRA_PENALTY; + } + return SEI_MIXED_MULTI_KANA_PENALTY; +} + /** * Look up a candidate string in the lexicon. * Returns the match type: surface > folded > reading > none. @@ -87,12 +133,19 @@ export function lookupMatch( /** * Calculate the score for a split candidate. + * + * @param sei - the surname candidate string + * @param splitIndex - the character index where this candidate splits (i.e. sei length) + * @param boundaryIndex - the unique kanji→kana boundary position, or undefined if none */ export function calcScore( + sei: string, seiMatch: MatchType, meiMatch: MatchType, seiLen: number, meiLen: number, + splitIndex: number, + boundaryIndex: number | undefined, ): number { let score = 0; @@ -124,5 +177,22 @@ export function calcScore( score = -Infinity; } + // Script boundary scoring + if (boundaryIndex !== undefined) { + if (splitIndex === boundaryIndex) { + score += BOUNDARY_MATCH_BONUS; + if (seiMatch === "surface" || seiMatch === "folded") { + score += BOUNDARY_MATCH_WITH_DICT_BONUS; + } + } else if (splitIndex < boundaryIndex) { + score += BOUNDARY_BEFORE_PENALTY; + } else { + score += BOUNDARY_AFTER_PENALTY; + } + } + + // OOV surname mixed-script penalty + score += seiMixedScriptPenalty(sei, seiMatch); + return score; } diff --git a/src/core/splitter.ts b/src/core/splitter.ts index 22404dc..9b1ea5f 100644 --- a/src/core/splitter.ts +++ b/src/core/splitter.ts @@ -12,11 +12,10 @@ import { calcScore, lookupMatch } from "./scorer.js"; const CONFIDENCE_THRESHOLD = 6.0; const CONFIDENCE_GAP = 1.0; -// Boundary heuristic: rescue candidates at kanji→kana boundaries -// when normal scoring falls below threshold -const BOUNDARY_RESCUE_BONUS = 3.0; -const BOUNDARY_RESCUE_MIN_GAP = 0.5; -const BOUNDARY_RESCUE_CONFIDENCE = 0.8; +// Boundary confidence: when the best candidate aligns with a script boundary +// and has dictionary evidence on sei side, grant confidence 0.8 +const BOUNDARY_CONFIDENCE = 0.8; +const BOUNDARY_CONFIDENCE_GAP = 0.5; let defaultLexicon: PackedLexicon | undefined; let defaultReading: ReadingData | undefined; @@ -79,7 +78,6 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const lexicon = options?.lexicon ?? defaultLexicon; if (!lexicon) { - // No lexicon loaded: return unsplit return { best: { sei: trimmed, mei: "" }, confidence: 0, @@ -100,6 +98,7 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const isKana = isAllHiragana(trimmed) || isAllKatakana(trimmed); const maxSplit = Math.min(lexicon.maxSeiLen, n - 1); + const boundaryIndex = findSingleKanjiToKanaBoundary(trimmed); const candidates: SeimeiCandidate[] = []; for (let i = 1; i <= maxSplit; i++) { @@ -113,7 +112,7 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const readingData = options?.readingData ?? defaultReading; const seiMatch = lookupMatch(sei, "sei", lexicon, isKana, readingData); const meiMatch = lookupMatch(mei, "mei", lexicon, isKana, readingData); - const score = calcScore(seiMatch, meiMatch, seiLen, meiLen); + const score = calcScore(sei, seiMatch, meiMatch, seiLen, meiLen, i, boundaryIndex); candidates.push({ sei, mei, score, seiMatch, meiMatch }); } @@ -144,21 +143,20 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult }; } - // 2. Boundary fallback: rescue at kanji→kana boundary - const fallbackConfidence = tryBoundaryFallback(trimmed, candidates); - if (fallbackConfidence !== undefined) { - // Find the boundary candidate (may differ from score-based best) - const boundaryIndex = findSingleKanjiToKanaBoundary(trimmed); - const boundaryCandidate = candidates.find( - (c) => [...c.sei].length === boundaryIndex - ); - if (boundaryCandidate) { - return { - best: { sei: boundaryCandidate.sei, mei: boundaryCandidate.mei }, - confidence: fallbackConfidence, - candidates, - }; - } + // 2. Boundary confidence: best candidate aligns with script boundary + // and has dictionary evidence on sei side + if ( + boundaryIndex !== undefined && + [...best.sei].length === boundaryIndex && + (best.seiMatch === "surface" || best.seiMatch === "folded") && + best.score >= CONFIDENCE_THRESHOLD && + gap >= BOUNDARY_CONFIDENCE_GAP + ) { + return { + best: { sei: best.sei, mei: best.mei }, + confidence: BOUNDARY_CONFIDENCE, + candidates, + }; } // 3. Low confidence mode @@ -177,43 +175,3 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult candidates, }; } - -/** - * Try to rescue a split using kanji→kana script boundary heuristic. - * Only applies when: - * - There is exactly one script transition (kanji → hiragana/katakana) - * - The boundary candidate has dictionary evidence on the surname side - * - The rescue score meets the confidence threshold - */ -function tryBoundaryFallback( - fullName: string, - candidates: SeimeiCandidate[], -): number | undefined { - const boundaryIndex = findSingleKanjiToKanaBoundary(fullName); - if (boundaryIndex === undefined) return undefined; - - const boundaryCandidate = candidates.find( - (c) => [...c.sei].length === boundaryIndex - ); - if (!boundaryCandidate) return undefined; - - // Require dictionary evidence on surname side - if ( - boundaryCandidate.seiMatch === "none" || - boundaryCandidate.seiMatch === "reading" - ) { - return undefined; - } - - const rescueScore = boundaryCandidate.score + BOUNDARY_RESCUE_BONUS; - if (rescueScore < CONFIDENCE_THRESHOLD) return undefined; - - // Check gap against other candidates' rescue scores - const otherBest = candidates - .filter((c) => [...c.sei].length !== boundaryIndex) - .reduce((max, c) => Math.max(max, c.score), -Infinity); - const rescueGap = rescueScore - otherBest; - if (rescueGap < BOUNDARY_RESCUE_MIN_GAP) return undefined; - - return BOUNDARY_RESCUE_CONFIDENCE; -} diff --git a/test/unit/split.test.ts b/test/unit/split.test.ts index 817d7a6..5e2c69f 100644 --- a/test/unit/split.test.ts +++ b/test/unit/split.test.ts @@ -3,8 +3,10 @@ import { split, analyze, setLexicon } from "../../src/core/splitter"; import type { PackedLexicon } from "../../src/core/types"; // Minimal test lexicon +// Note: 夏色 and 白銀 are NOT included as surnames — they must be resolved +// by script boundary heuristic, not dictionary lookup const testLexicon: PackedLexicon = { - sei: ["田中", "佐藤", "大瀬良", "林", "勅使河原", "小鳥遊", "西園寺", "齋藤", "綾瀬", "白銀", "夏色"], + sei: ["田中", "佐藤", "大瀬良", "林", "勅使河原", "小鳥遊", "西園寺", "齋藤", "綾瀬", "夏", "周防"], mei: ["太郎", "花子", "大地", "健太", "公望", "翔", "一郎"], folded: { "斎藤": ["齋藤"], @@ -79,37 +81,48 @@ describe("split", () => { }); }); - describe("境界ヒューリスティック", () => { - it("漢字姓 + ひらがな名を境界フォールバックで救済する", () => { + describe("文字種境界スコアリング", () => { + it("辞書ヒット姓 + かな名を境界で分離する(綾瀬はるか)", () => { const result = analyze("綾瀬はるか"); expect(result.best).toEqual({ sei: "綾瀬", mei: "はるか" }); - expect(result.confidence).toBe(0.8); + expect(result.confidence).toBeGreaterThanOrEqual(0.8); }); - it("漢字姓 + カタカナ名を救済する", () => { - const result = analyze("白銀ノエル"); - expect(result.best).toEqual({ sei: "白銀", mei: "ノエル" }); - expect(result.confidence).toBe(0.8); + it("辞書ヒット姓 + カタカナ名を境界で分離する(周防パトラ)", () => { + const result = analyze("周防パトラ"); + expect(result.best).toEqual({ sei: "周防", mei: "パトラ" }); + expect(result.confidence).toBeGreaterThanOrEqual(0.8); + }); + + it("allowLowConfidence: 辞書未登録でも境界位置が最高スコアになる(夏色まつり)", () => { + // 夏色 is NOT in the dictionary, but boundary scoring should + // make 夏色/まつり rank higher than 夏/色まつり + const result = analyze("夏色まつり", { allowLowConfidence: true }); + expect(result.best).toEqual({ sei: "夏色", mei: "まつり" }); }); - it("漢字姓(辞書ヒット) + ひらがな名を救済する", () => { - expect(split("夏色まつり")).toEqual({ sei: "夏色", mei: "まつり" }); + it("allowLowConfidence: 漢字→カタカナ境界が勝つ(白銀ノエル)", () => { + const result = analyze("白銀ノエル", { allowLowConfidence: true }); + expect(result.best).toEqual({ sei: "白銀", mei: "ノエル" }); }); - it("姓側に辞書根拠がない場合は救済しない", () => { + it("辞書根拠がない場合は通常モードで unsplit", () => { expect(split("東京はなこ")).toEqual({ sei: "東京はなこ", mei: "" }); }); - it("文字種遷移が2回以上ある場合は境界救済しない", () => { - // "夢野あき子" — 遷移2回(漢字→ひらがな→漢字)、辞書ヒットもなし + it("文字種遷移が2回以上ある場合は境界ボーナスなし", () => { expect(split("夢野あき子")).toEqual({ sei: "夢野あき子", mei: "" }); }); - it("通常の辞書高信頼ケースは従来どおり confidence 1.0", () => { + it("全漢字名は境界スコアの影響を受けない", () => { const result = analyze("田中太郎"); expect(result.best).toEqual({ sei: "田中", mei: "太郎" }); expect(result.confidence).toBe(1.0); }); + + it("1文字姓の正当な辞書ヒットは境界がなければ維持される(林一郎)", () => { + expect(split("林一郎")).toEqual({ sei: "林", mei: "一郎" }); + }); }); describe("analyze", () => { From 60bfa852d2a5690ff1fde88fd4ef6030b3df54bb Mon Sep 17 00:00:00 2001 From: tk1024 Date: Sun, 22 Mar 2026 22:06:24 +0900 Subject: [PATCH 3/5] =?UTF-8?q?feat:=20=E3=81=8B=E3=81=AA=E2=86=92?= =?UTF-8?q?=E6=BC=A2=E5=AD=97=E5=A2=83=E7=95=8C=E3=82=B9=E3=82=B3=E3=82=A2?= =?UTF-8?q?=E3=83=AA=E3=83=B3=E3=82=B0=E3=82=92=E8=BF=BD=E5=8A=A0=E3=80=81?= =?UTF-8?q?mei=E5=81=B4=E3=81=AE=E6=B7=B7=E5=9C=A8=E3=83=9A=E3=83=8A?= =?UTF-8?q?=E3=83=AB=E3=83=86=E3=82=A3=E3=81=AF=E9=99=A4=E5=A4=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - findSingleScriptBoundary() で漢字→かな/かな→漢字の両方向を検出 - かな→漢字方向でも境界ボーナス/ペナルティが効くように - mei 側の混在ペナルティは除外(よね子、ルミ子等の自然な名前を保護) - かな姓+漢字名 [lowConf]: 56.3% → 87.5% - 名前内部にかな混在 [lowConf]: 100% 維持(リグレッションなし) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/normalize.ts | 44 +++++++++++++++++++++++++++++++++---------- src/core/scorer.ts | 24 ++++++++++++++++++++++- src/core/splitter.ts | 39 +++++++++++++++++++++----------------- 3 files changed, 79 insertions(+), 28 deletions(-) diff --git a/src/core/normalize.ts b/src/core/normalize.ts index 7d202f0..601cdc4 100644 --- a/src/core/normalize.ts +++ b/src/core/normalize.ts @@ -50,17 +50,23 @@ function scriptOf(ch: string): ScriptType { } /** - * Find the split position where a single kanji→kana boundary occurs. - * Returns the character index (not byte index) of the boundary, - * or undefined if no unique boundary exists. + * Find the split position where a single script boundary occurs + * between kanji and kana (in either direction). + * Returns the character index and direction, or undefined if no unique boundary exists. * * Examples: - * "夏色まつり" → 2 (漢字→ひらがな at index 2) - * "白銀ノエル" → 2 (漢字→カタカナ at index 2) + * "夏色まつり" → { index: 2, direction: "kanji-to-kana" } + * "白銀ノエル" → { index: 2, direction: "kanji-to-kana" } + * "デーモン閣下" → { index: 4, direction: "kana-to-kanji" } * "もこ田めめめ" → undefined (2 transitions) * "田中太郎" → undefined (all kanji, no transition) */ -export function findSingleKanjiToKanaBoundary(fullName: string): number | undefined { +export interface ScriptBoundary { + index: number; + direction: "kanji-to-kana" | "kana-to-kanji"; +} + +export function findSingleScriptBoundary(fullName: string): ScriptBoundary | undefined { const chars = [...fullName]; let transitionCount = 0; let splitIndex: number | undefined; @@ -79,10 +85,28 @@ export function findSingleKanjiToKanaBoundary(fullName: string): number | undefi toScript = next; } - if (transitionCount !== 1) return undefined; - if (fromScript !== "kanji") return undefined; - if (toScript !== "hiragana" && toScript !== "katakana") return undefined; - return splitIndex; + if (transitionCount !== 1 || splitIndex === undefined) return undefined; + + const fromIsKanji = fromScript === "kanji"; + const toIsKanji = toScript === "kanji"; + const fromIsKana = fromScript === "hiragana" || fromScript === "katakana"; + const toIsKana = toScript === "hiragana" || toScript === "katakana"; + + if (fromIsKanji && toIsKana) { + return { index: splitIndex, direction: "kanji-to-kana" }; + } + if (fromIsKana && toIsKanji) { + return { index: splitIndex, direction: "kana-to-kanji" }; + } + + return undefined; +} + +/** @deprecated Use findSingleScriptBoundary instead */ +export function findSingleKanjiToKanaBoundary(fullName: string): number | undefined { + const result = findSingleScriptBoundary(fullName); + if (result?.direction === "kanji-to-kana") return result.index; + return undefined; } /** diff --git a/src/core/scorer.ts b/src/core/scorer.ts index 4177607..8225c6c 100644 --- a/src/core/scorer.ts +++ b/src/core/scorer.ts @@ -91,6 +91,26 @@ function seiMixedScriptPenalty(sei: string, seiMatch: MatchType): number { return SEI_MIXED_MULTI_KANA_PENALTY; } +/** + * Penalty for OOV given names that start with kana followed by kanji (e.g. モン閣下, イク眞木). + * When a kana→kanji boundary exists, the mei side should be pure kanji. + * Only applied when the given name has no dictionary hit. + */ +function meiMixedScriptPenalty(mei: string, meiMatch: MatchType): number { + if (meiMatch !== "none") return 0; + + const p = scriptPattern(mei); + if (!/^[HT]+K+$/.test(p)) return 0; + + const prefix = p.match(/^[HT]+/)![0]; + if (prefix.length === 1) { + return prefix[0] === "T" + ? SEI_MIXED_SINGLE_KATA_PENALTY + : SEI_MIXED_SINGLE_HIRA_PENALTY; + } + return SEI_MIXED_MULTI_KANA_PENALTY; +} + /** * Look up a candidate string in the lexicon. * Returns the match type: surface > folded > reading > none. @@ -140,6 +160,7 @@ export function lookupMatch( */ export function calcScore( sei: string, + mei: string, seiMatch: MatchType, meiMatch: MatchType, seiLen: number, @@ -191,7 +212,8 @@ export function calcScore( } } - // OOV surname mixed-script penalty + // OOV surname mixed-script penalty (mei side is not penalized — + // names like よね子, ルミ子, 美つ子 naturally mix scripts) score += seiMixedScriptPenalty(sei, seiMatch); return score; diff --git a/src/core/splitter.ts b/src/core/splitter.ts index 9b1ea5f..88869a0 100644 --- a/src/core/splitter.ts +++ b/src/core/splitter.ts @@ -6,14 +6,14 @@ import type { SeimeiResult, SplitOptions, } from "./types.js"; -import { isAllHiragana, isAllKatakana, isNonJapanese, findSingleKanjiToKanaBoundary } from "./normalize.js"; +import { isAllHiragana, isAllKatakana, isNonJapanese, findSingleScriptBoundary } from "./normalize.js"; import { calcScore, lookupMatch } from "./scorer.js"; const CONFIDENCE_THRESHOLD = 6.0; const CONFIDENCE_GAP = 1.0; // Boundary confidence: when the best candidate aligns with a script boundary -// and has dictionary evidence on sei side, grant confidence 0.8 +// and has dictionary evidence, grant confidence 0.8 const BOUNDARY_CONFIDENCE = 0.8; const BOUNDARY_CONFIDENCE_GAP = 0.5; @@ -98,7 +98,8 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const isKana = isAllHiragana(trimmed) || isAllKatakana(trimmed); const maxSplit = Math.min(lexicon.maxSeiLen, n - 1); - const boundaryIndex = findSingleKanjiToKanaBoundary(trimmed); + const boundary = findSingleScriptBoundary(trimmed); + const boundaryIndex = boundary?.index; const candidates: SeimeiCandidate[] = []; for (let i = 1; i <= maxSplit; i++) { @@ -112,7 +113,7 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult const readingData = options?.readingData ?? defaultReading; const seiMatch = lookupMatch(sei, "sei", lexicon, isKana, readingData); const meiMatch = lookupMatch(mei, "mei", lexicon, isKana, readingData); - const score = calcScore(sei, seiMatch, meiMatch, seiLen, meiLen, i, boundaryIndex); + const score = calcScore(sei, mei, seiMatch, meiMatch, seiLen, meiLen, i, boundaryIndex); candidates.push({ sei, mei, score, seiMatch, meiMatch }); } @@ -144,19 +145,23 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult } // 2. Boundary confidence: best candidate aligns with script boundary - // and has dictionary evidence on sei side - if ( - boundaryIndex !== undefined && - [...best.sei].length === boundaryIndex && - (best.seiMatch === "surface" || best.seiMatch === "folded") && - best.score >= CONFIDENCE_THRESHOLD && - gap >= BOUNDARY_CONFIDENCE_GAP - ) { - return { - best: { sei: best.sei, mei: best.mei }, - confidence: BOUNDARY_CONFIDENCE, - candidates, - }; + // and has dictionary evidence on the appropriate side + if (boundaryIndex !== undefined && [...best.sei].length === boundaryIndex) { + const hasDictEvidence = boundary?.direction === "kanji-to-kana" + ? (best.seiMatch === "surface" || best.seiMatch === "folded") + : (best.meiMatch === "surface" || best.meiMatch === "folded"); + + if ( + hasDictEvidence && + best.score >= CONFIDENCE_THRESHOLD && + gap >= BOUNDARY_CONFIDENCE_GAP + ) { + return { + best: { sei: best.sei, mei: best.mei }, + confidence: BOUNDARY_CONFIDENCE, + candidates, + }; + } } // 3. Low confidence mode From 4dd4aa2ef06456edb295c5c1e66ffd8b56159142 Mon Sep 17 00:00:00 2001 From: tk1024 Date: Sun, 22 Mar 2026 22:12:35 +0900 Subject: [PATCH 4/5] =?UTF-8?q?fix:=20SEI=5FHIT=5FBONUS=20=E3=82=92=200.8?= =?UTF-8?q?=20=E2=86=92=200.5=20=E3=81=AB=E8=AA=BF=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 0.8 だと1文字姓の辞書ヒットが強すぎて、上白石萌音、柳樂優弥等で 正しい複数文字姓に勝ってしまうリグレッションが発生していた。 0.5 で松井珠理奈等の改善を維持しつつリグレッションを解消。 Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/scorer.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/core/scorer.ts b/src/core/scorer.ts index 8225c6c..cf9383e 100644 --- a/src/core/scorer.ts +++ b/src/core/scorer.ts @@ -30,6 +30,10 @@ const MEI_LENGTH_SCORE: Record = { const PAIR_BONUS = 0.8; const BOTH_SINGLE_CHAR_PENALTY = -1.0; +// Surname hit is stronger evidence than given name hit, +// because surnames are a finite known set while given names are creative +const SEI_HIT_BONUS = 0.5; + // Script boundary scoring const BOUNDARY_MATCH_BONUS = 1.2; const BOUNDARY_MATCH_WITH_DICT_BONUS = 0.8; @@ -174,6 +178,12 @@ export function calcScore( score += MATCH_SCORE[seiMatch]; score += MATCH_SCORE[meiMatch]; + // Surname hit bonus: surnames are a known finite set, + // so a dictionary hit on sei is stronger evidence than on mei + if (seiMatch === "surface" || seiMatch === "folded") { + score += SEI_HIT_BONUS; + } + // Length scores (secondary signal) score += SEI_LENGTH_SCORE[Math.min(seiLen, 6)] ?? -0.5; score += MEI_LENGTH_SCORE[Math.min(meiLen, 6)] ?? -0.5; From a02045385f8e366f7f32dd9cbb58981b97337895 Mon Sep 17 00:00:00 2001 From: tk1024 Date: Sun, 22 Mar 2026 22:23:29 +0900 Subject: [PATCH 5/5] =?UTF-8?q?feat:=20=E3=82=AB=E3=82=BF=E3=82=AB?= =?UTF-8?q?=E3=83=8A=E5=A7=93=E3=81=AE=E4=BE=8B=E5=A4=96=E3=83=95=E3=83=AD?= =?UTF-8?q?=E3=83=BC=20+=20SEI=5FHIT=5FBONUS=E8=AA=BF=E6=95=B4=20+=20?= =?UTF-8?q?=E3=83=A6=E3=83=8B=E3=83=83=E3=83=88=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E6=8B=A1=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - カタカナ姓の例外フロー: 前半が全カタカナの場合、後半を姓辞書で照合 して一致すれば芸名パターンとして採用(confidence 0.8) 例: ジャガー/横田、ダン/池田、マイク/眞木 - SEI_HIT_BONUS: 0.8 → 0.5 に調整(1文字姓リグレッション防止) - ユニットテストを大幅拡充: - findSingleScriptBoundary の関数テスト - OOV姓混在ペナルティのテスト - 姓ヒットボーナスのテスト - カタカナ姓例外フローのテスト 84テスト全パス かな姓+漢字名 [lowConf]: 81.3% → 100% MVP [lowConf]: 99.5% (1 wrong: 池井戸潤) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/splitter.ts | 24 ++++++++++- test/unit/split.test.ts | 95 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 107 insertions(+), 12 deletions(-) diff --git a/src/core/splitter.ts b/src/core/splitter.ts index 88869a0..aa615bd 100644 --- a/src/core/splitter.ts +++ b/src/core/splitter.ts @@ -164,7 +164,29 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult } } - // 3. Low confidence mode + // 3. Katakana-sei exception: when the best candidate's sei is all katakana + // (e.g. ジャガー/横田), real Japanese surnames are never pure katakana. + // Re-score the boundary candidate by looking up mei in sei dict instead. + if (boundary?.direction === "kana-to-kanji" && boundaryIndex !== undefined) { + const boundaryCandidate = candidates.find( + (c) => [...c.sei].length === boundaryIndex && isAllKatakana(c.sei) + ); + if (boundaryCandidate) { + const meiAsSei = lookupMatch( + boundaryCandidate.mei, "sei", lexicon, false, + options?.readingData ?? defaultReading, + ); + if (meiAsSei === "surface" || meiAsSei === "folded") { + return { + best: { sei: boundaryCandidate.sei, mei: boundaryCandidate.mei }, + confidence: BOUNDARY_CONFIDENCE, + candidates, + }; + } + } + } + + // 4. Low confidence mode if (options?.allowLowConfidence) { return { best: { sei: best.sei, mei: best.mei }, diff --git a/test/unit/split.test.ts b/test/unit/split.test.ts index 5e2c69f..5fa1daa 100644 --- a/test/unit/split.test.ts +++ b/test/unit/split.test.ts @@ -1,18 +1,16 @@ import { describe, it, expect, beforeAll } from "vitest"; import { split, analyze, setLexicon } from "../../src/core/splitter"; +import { findSingleScriptBoundary } from "../../src/core/normalize"; import type { PackedLexicon } from "../../src/core/types"; -// Minimal test lexicon -// Note: 夏色 and 白銀 are NOT included as surnames — they must be resolved -// by script boundary heuristic, not dictionary lookup const testLexicon: PackedLexicon = { - sei: ["田中", "佐藤", "大瀬良", "林", "勅使河原", "小鳥遊", "西園寺", "齋藤", "綾瀬", "夏", "周防"], - mei: ["太郎", "花子", "大地", "健太", "公望", "翔", "一郎"], + sei: ["田中", "佐藤", "大瀬良", "林", "勅使河原", "小鳥遊", "西園寺", "齋藤", "綾瀬", "夏", "周防", "横田", "池田", "秋山", "松村", "高峰"], + mei: ["太郎", "花子", "大地", "健太", "公望", "翔", "一郎", "リン", "田"], folded: { "斎藤": ["齋藤"], }, maxSeiLen: 4, - maxMeiLen: 3, + maxMeiLen: 5, }; describe("split", () => { @@ -81,6 +79,36 @@ describe("split", () => { }); }); + describe("findSingleScriptBoundary", () => { + it("漢字→ひらがな境界を検出する", () => { + expect(findSingleScriptBoundary("夏色まつり")).toEqual({ index: 2, direction: "kanji-to-kana" }); + }); + + it("漢字→カタカナ境界を検出する", () => { + expect(findSingleScriptBoundary("白銀ノエル")).toEqual({ index: 2, direction: "kanji-to-kana" }); + }); + + it("カタカナ→漢字境界を検出する", () => { + expect(findSingleScriptBoundary("ジャガー横田")).toEqual({ index: 4, direction: "kana-to-kanji" }); + }); + + it("ひらがな→漢字境界を検出する", () => { + expect(findSingleScriptBoundary("かたせ梨乃")).toEqual({ index: 3, direction: "kana-to-kanji" }); + }); + + it("全漢字は undefined", () => { + expect(findSingleScriptBoundary("田中太郎")).toBeUndefined(); + }); + + it("遷移2回は undefined", () => { + expect(findSingleScriptBoundary("もこ田めめめ")).toBeUndefined(); + }); + + it("全ひらがなは undefined", () => { + expect(findSingleScriptBoundary("たなかたろう")).toBeUndefined(); + }); + }); + describe("文字種境界スコアリング", () => { it("辞書ヒット姓 + かな名を境界で分離する(綾瀬はるか)", () => { const result = analyze("綾瀬はるか"); @@ -94,14 +122,12 @@ describe("split", () => { expect(result.confidence).toBeGreaterThanOrEqual(0.8); }); - it("allowLowConfidence: 辞書未登録でも境界位置が最高スコアになる(夏色まつり)", () => { - // 夏色 is NOT in the dictionary, but boundary scoring should - // make 夏色/まつり rank higher than 夏/色まつり + it("allowLowConfidence: 辞書未登録でも境界位置が最高スコアになる", () => { const result = analyze("夏色まつり", { allowLowConfidence: true }); expect(result.best).toEqual({ sei: "夏色", mei: "まつり" }); }); - it("allowLowConfidence: 漢字→カタカナ境界が勝つ(白銀ノエル)", () => { + it("allowLowConfidence: 漢字→カタカナ境界が勝つ", () => { const result = analyze("白銀ノエル", { allowLowConfidence: true }); expect(result.best).toEqual({ sei: "白銀", mei: "ノエル" }); }); @@ -120,11 +146,58 @@ describe("split", () => { expect(result.confidence).toBe(1.0); }); - it("1文字姓の正当な辞書ヒットは境界がなければ維持される(林一郎)", () => { + it("1文字姓の正当な辞書ヒットは境界がなければ維持される", () => { expect(split("林一郎")).toEqual({ sei: "林", mei: "一郎" }); }); }); + describe("OOV姓の混在ペナルティ", () => { + it("漢字+カタカナ1文字の姓は大きく減点される", () => { + // 宝鐘マ/リン より 宝鐘/マリン が勝つべき + const result = analyze("宝鐘マリン", { allowLowConfidence: true }); + expect(result.best).toEqual({ sei: "宝鐘", mei: "マリン" }); + }); + + it("漢字+ひらがなの姓も減点される", () => { + const result = analyze("星街すいせい", { allowLowConfidence: true }); + expect(result.best).toEqual({ sei: "星街", mei: "すいせい" }); + }); + + it("辞書ヒットする姓には混在ペナルティが適用されない", () => { + // 綾瀬 is in dict — no penalty + const result = analyze("綾瀬はるか"); + expect(result.best).toEqual({ sei: "綾瀬", mei: "はるか" }); + }); + }); + + describe("姓ヒットボーナス", () => { + it("姓辞書ヒットは名辞書ヒットより優先される", () => { + // 松村/沙友理 (sei=surface) vs 松村沙/友理 (mei=surface) + const result = analyze("松村沙友理", { allowLowConfidence: true }); + expect(result.best.sei).toBe("松村"); + }); + }); + + describe("カタカナ姓の例外フロー", () => { + it("全カタカナ姓 + 漢字名は後半を姓辞書で照合する", () => { + // ジャガー/横田: 横田が姓辞書にヒット → 例外フローで採用 + const result = analyze("ジャガー横田"); + expect(result.best).toEqual({ sei: "ジャガー", mei: "横田" }); + expect(result.confidence).toBe(0.8); + }); + + it("ダン/池田も例外フローで分離する", () => { + const result = analyze("ダン池田"); + expect(result.best).toEqual({ sei: "ダン", mei: "池田" }); + expect(result.confidence).toBe(0.8); + }); + + it("漢字姓+漢字名には例外フローが適用されない", () => { + const result = analyze("田中太郎"); + expect(result.confidence).toBe(1.0); + }); + }); + describe("analyze", () => { it("候補リストとconfidenceを返す", () => { const result = analyze("田中太郎");