Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions src/core/normalize.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,75 @@ const VARIANT_MAP: Record<string, string> = {
"條": "条", "圓": "円",
};

type ScriptType = "kanji" | "hiragana" | "katakana" | "other";

/**
 * Classify a single character by Japanese script.
 *
 * The kanji test runs first on purpose: the small "ヶ" (U+30F6) also falls
 * inside the katakana range U+30A1–U+30F6, so testing katakana first would
 * make the "ヶ" entry of the kanji class unreachable and report it as katakana.
 *
 * @param ch - a single character (one code point)
 * @returns the script of `ch`, or "other" when it is neither kanji nor kana
 */
function scriptOf(ch: string): ScriptType {
  if (/[\p{Script=Han}々〆ヶ]/u.test(ch)) return "kanji";
  if (/[\u3041-\u3096]/.test(ch)) return "hiragana";
  if (/[\u30A1-\u30F6\u30FC]/.test(ch)) return "katakana";
  return "other";
}

/** Location and direction of the one kanji/kana script change found in a name. */
export interface ScriptBoundary {
  /** Index, counted in code points, of the first character after the change. */
  index: number;
  /** Which script precedes the change and which follows it. */
  direction: "kanji-to-kana" | "kana-to-kanji";
}

/**
 * Find the split position where a single script boundary occurs
 * between kanji and kana (in either direction).
 *
 * The name is walked code point by code point. Every change of script between
 * neighbouring characters counts as a transition, including hiragana ↔ katakana.
 * The result is undefined when:
 *  - any change involves a character that is neither kanji nor kana,
 *  - there is no transition or more than one, or
 *  - the only transition is between hiragana and katakana.
 *
 * Examples:
 *   "夏色まつり"   → { index: 2, direction: "kanji-to-kana" }
 *   "白銀ノエル"   → { index: 2, direction: "kanji-to-kana" }
 *   "デーモン閣下" → { index: 4, direction: "kana-to-kanji" }
 *   "もこ田めめめ" → undefined (2 transitions)
 *   "田中太郎"     → undefined (all kanji, no transition)
 *
 * @param fullName - the unsplit name to inspect
 * @returns the character index and direction, or undefined if no unique boundary exists
 */
export function findSingleScriptBoundary(fullName: string): ScriptBoundary | undefined {
  let change: { at: number; before: ScriptType; after: ScriptType } | undefined;
  let previous: ScriptType | undefined;
  let position = 0;

  // Iterating the string directly yields code points, so surrogate pairs stay intact.
  for (const ch of fullName) {
    const current = scriptOf(ch);
    if (previous !== undefined && previous !== current) {
      // A change to or from a non-Japanese character disqualifies the name.
      if (previous === "other" || current === "other") return undefined;
      // A second change means the boundary is not unique.
      if (change !== undefined) return undefined;
      change = { at: position, before: previous, after: current };
    }
    previous = current;
    position += 1;
  }

  if (change === undefined) return undefined;

  const isKana = (script: ScriptType): boolean =>
    script === "hiragana" || script === "katakana";

  if (change.before === "kanji" && isKana(change.after)) {
    return { index: change.at, direction: "kanji-to-kana" };
  }
  if (isKana(change.before) && change.after === "kanji") {
    return { index: change.at, direction: "kana-to-kanji" };
  }

  // Remaining case: the single change was hiragana ↔ katakana.
  return undefined;
}

/**
 * Legacy wrapper that reports only a kanji→kana boundary.
 *
 * @param fullName - the unsplit name to inspect
 * @returns the boundary index when the single boundary runs kanji→kana, otherwise undefined
 * @deprecated Use findSingleScriptBoundary instead
 */
export function findSingleKanjiToKanaBoundary(fullName: string): number | undefined {
  const boundary = findSingleScriptBoundary(fullName);
  if (boundary === undefined || boundary.direction !== "kanji-to-kana") {
    return undefined;
  }
  return boundary.index;
}

/**
* Fold variant kanji to their canonical forms.
*/
Expand Down
102 changes: 102 additions & 0 deletions src/core/scorer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,21 @@ const MEI_LENGTH_SCORE: Record<number, number> = {
const PAIR_BONUS = 0.8;
const BOTH_SINGLE_CHAR_PENALTY = -1.0;

// Surname hit is stronger evidence than given name hit,
// because surnames are a finite known set while given names are creative.
// Added by calcScore when the surname match type is "surface" or "folded".
const SEI_HIT_BONUS = 0.5;

// Script boundary scoring (applied by calcScore only when a boundary index is supplied).
// Bonus when the candidate split lands exactly on the boundary.
const BOUNDARY_MATCH_BONUS = 1.2;
// Extra bonus, on top of BOUNDARY_MATCH_BONUS, when the surname also has a "surface"/"folded" match.
const BOUNDARY_MATCH_WITH_DICT_BONUS = 0.8;
// Penalty when the candidate split falls before the boundary.
const BOUNDARY_BEFORE_PENALTY = -3.0;
// Penalty when the candidate split falls after the boundary.
const BOUNDARY_AFTER_PENALTY = -1.8;

// Sei mixed-script penalty: OOV surname containing kana is unnatural.
// (meiMixedScriptPenalty reuses the same three values for the kana run of a given name.)
// Kana run is exactly one hiragana character.
const SEI_MIXED_SINGLE_HIRA_PENALTY = -2.5;
// Kana run is exactly one katakana character.
const SEI_MIXED_SINGLE_KATA_PENALTY = -3.0;
// Kana run is two or more characters long.
const SEI_MIXED_MULTI_KANA_PENALTY = -1.5;

// Cache for Set-based lookups built from string[]
const setCache = new WeakMap<PackedLexicon, { sei: Set<string>; mei: Set<string> }>();

Expand All @@ -45,6 +60,61 @@ function getSets(lexicon: PackedLexicon): { sei: Set<string>; mei: Set<string> }
return cached;
}

// Character classes used by scriptOf to recognise each script.
const RE_KANJI = /[\p{Script=Han}々〆ヶ]/u;
const RE_HIRAGANA = /[\u3041-\u3096]/;
const RE_KATAKANA = /[\u30A1-\u30F6\u30FC]/;

/**
 * Reduce a character to a one-letter script code.
 *
 * Kanji is tested first, so "ヶ" (which also sits in the katakana range)
 * is reported as "K".
 *
 * @param ch - a single character
 * @returns "K" for kanji, "H" for hiragana, "T" for katakana, "O" for anything else
 */
function scriptOf(ch: string): "K" | "H" | "T" | "O" {
  let code: "K" | "H" | "T" | "O" = "O";
  if (RE_KANJI.test(ch)) {
    code = "K";
  } else if (RE_HIRAGANA.test(ch)) {
    code = "H";
  } else if (RE_KATAKANA.test(ch)) {
    code = "T";
  }
  return code;
}

/**
 * Build the script signature of a string: one script code letter per
 * code point, in order (e.g. two kanji followed by three hiragana → "KKHHH").
 *
 * @param s - the string to describe
 * @returns the concatenated script codes; empty for an empty string
 */
function scriptPattern(s: string): string {
  let pattern = "";
  for (const ch of s) {
    pattern += scriptOf(ch);
  }
  return pattern;
}

/**
 * Penalty for OOV surnames that contain kana (e.g. 宝鐘マ, 星街すい).
 * Real Japanese surnames are almost always pure kanji.
 *
 * Applies only when the surname has no dictionary hit and its script pattern
 * is one or more kanji followed by one or more kana; everything else scores 0.
 *
 * @param sei - the surname candidate
 * @param seiMatch - dictionary match type found for `sei`
 * @returns 0, or a negative penalty chosen by the length and script of the kana run
 */
function seiMixedScriptPenalty(sei: string, seiMatch: MatchType): number {
  if (seiMatch !== "none") return 0;

  // Capture the trailing kana run; no match means the shape is not "kanji then kana".
  const kanaTail = /^K+([HT]+)$/.exec(scriptPattern(sei))?.[1];
  if (kanaTail === undefined) return 0;

  if (kanaTail.length > 1) return SEI_MIXED_MULTI_KANA_PENALTY;

  // Exactly one kana character: katakana is penalized harder than hiragana.
  return kanaTail === "T"
    ? SEI_MIXED_SINGLE_KATA_PENALTY
    : SEI_MIXED_SINGLE_HIRA_PENALTY;
}

/**
 * Penalty for OOV given names that start with kana followed by kanji (e.g. モン閣下, イク眞木).
 * When a kana→kanji boundary exists, the mei side should be pure kanji.
 *
 * Applies only when the given name has no dictionary hit and its script pattern
 * is one or more kana followed by one or more kanji; everything else scores 0.
 * The penalty values are the same SEI_MIXED_* constants used for surnames.
 *
 * NOTE(review): no call site is visible in this part of the file, and the comment
 * in calcScore says the mei side is not penalized — confirm whether this is still used.
 *
 * @param mei - the given-name candidate
 * @param meiMatch - dictionary match type found for `mei`
 * @returns 0, or a negative penalty chosen by the length and script of the kana run
 */
function meiMixedScriptPenalty(mei: string, meiMatch: MatchType): number {
  if (meiMatch !== "none") return 0;

  // Capture the leading kana run; no match means the shape is not "kana then kanji".
  const kanaHead = /^([HT]+)K+$/.exec(scriptPattern(mei))?.[1];
  if (kanaHead === undefined) return 0;

  if (kanaHead.length > 1) return SEI_MIXED_MULTI_KANA_PENALTY;

  // Exactly one kana character: katakana is penalized harder than hiragana.
  return kanaHead === "T"
    ? SEI_MIXED_SINGLE_KATA_PENALTY
    : SEI_MIXED_SINGLE_HIRA_PENALTY;
}

/**
* Look up a candidate string in the lexicon.
* Returns the match type: surface > folded > reading > none.
Expand Down Expand Up @@ -87,19 +157,33 @@ export function lookupMatch(

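/**
 * Calculate the score for a split candidate.
 *
 * @param sei - the surname candidate string
 * @param mei - the given-name candidate string
 * @param seiMatch - dictionary match type found for `sei`
 * @param meiMatch - dictionary match type found for `mei`
 * @param seiLen - length of `sei`, used for the length score
 * @param meiLen - length of `mei`, used for the length score
 * @param splitIndex - the character index where this candidate splits (i.e. sei length)
 * @param boundaryIndex - the unique script boundary position (kanji→kana or kana→kanji), or undefined if none
 * @returns the candidate's total score
 */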
export function calcScore(
sei: string,
mei: string,
seiMatch: MatchType,
meiMatch: MatchType,
seiLen: number,
meiLen: number,
splitIndex: number,
boundaryIndex: number | undefined,
): number {
let score = 0;

// Match scores (primary signal)
score += MATCH_SCORE[seiMatch];
score += MATCH_SCORE[meiMatch];

// Surname hit bonus: surnames are a known finite set,
// so a dictionary hit on sei is stronger evidence than on mei
if (seiMatch === "surface" || seiMatch === "folded") {
score += SEI_HIT_BONUS;
}

// Length scores (secondary signal)
score += SEI_LENGTH_SCORE[Math.min(seiLen, 6)] ?? -0.5;
score += MEI_LENGTH_SCORE[Math.min(meiLen, 6)] ?? -0.5;
Expand All @@ -124,5 +208,23 @@ export function calcScore(
score = -Infinity;
}

// Script boundary scoring
if (boundaryIndex !== undefined) {
if (splitIndex === boundaryIndex) {
score += BOUNDARY_MATCH_BONUS;
if (seiMatch === "surface" || seiMatch === "folded") {
score += BOUNDARY_MATCH_WITH_DICT_BONUS;
}
} else if (splitIndex < boundaryIndex) {
score += BOUNDARY_BEFORE_PENALTY;
} else {
score += BOUNDARY_AFTER_PENALTY;
}
}

// OOV surname mixed-script penalty (mei side is not penalized —
// names like よね子, ルミ子, 美つ子 naturally mix scripts)
score += seiMixedScriptPenalty(sei, seiMatch);

return score;
}
70 changes: 64 additions & 6 deletions src/core/splitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,17 @@ import type {
SeimeiResult,
SplitOptions,
} from "./types.js";
import { isAllHiragana, isAllKatakana, isNonJapanese } from "./normalize.js";
import { isAllHiragana, isAllKatakana, isNonJapanese, findSingleScriptBoundary } from "./normalize.js";
import { calcScore, lookupMatch } from "./scorer.js";

// Minimum score the best candidate needs in both the normal and the boundary
// confidence checks; also the divisor for the low-confidence-mode confidence value.
const CONFIDENCE_THRESHOLD = 6.0;
// Minimum `gap` value (computed in analyze) required for a normal, confidence 1.0 result.
const CONFIDENCE_GAP = 1.0;

// Boundary confidence: when the best candidate aligns with a script boundary
// and has dictionary evidence, grant confidence 0.8.
// The katakana-sei exception in analyze returns the same value.
const BOUNDARY_CONFIDENCE = 0.8;
// Smaller minimum `gap` accepted on the boundary confidence path.
const BOUNDARY_CONFIDENCE_GAP = 0.5;

let defaultLexicon: PackedLexicon | undefined;
let defaultReading: ReadingData | undefined;

Expand Down Expand Up @@ -73,7 +78,6 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult

const lexicon = options?.lexicon ?? defaultLexicon;
if (!lexicon) {
// No lexicon loaded: return unsplit
return {
best: { sei: trimmed, mei: "" },
confidence: 0,
Expand All @@ -94,6 +98,8 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult

const isKana = isAllHiragana(trimmed) || isAllKatakana(trimmed);
const maxSplit = Math.min(lexicon.maxSeiLen, n - 1);
const boundary = findSingleScriptBoundary(trimmed);
const boundaryIndex = boundary?.index;
const candidates: SeimeiCandidate[] = [];

for (let i = 1; i <= maxSplit; i++) {
Expand All @@ -107,7 +113,7 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
const readingData = options?.readingData ?? defaultReading;
const seiMatch = lookupMatch(sei, "sei", lexicon, isKana, readingData);
const meiMatch = lookupMatch(mei, "mei", lexicon, isKana, readingData);
const score = calcScore(seiMatch, meiMatch, seiLen, meiLen);
const score = calcScore(sei, mei, seiMatch, meiMatch, seiLen, meiLen, i, boundaryIndex);

candidates.push({ sei, mei, score, seiMatch, meiMatch });
}
Expand All @@ -129,15 +135,67 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
const confident =
best.score >= CONFIDENCE_THRESHOLD && gap >= CONFIDENCE_GAP;

if (confident || options?.allowLowConfidence) {
// 1. Normal confidence: dictionary-based high score
if (confident) {
return {
best: { sei: best.sei, mei: best.mei },
confidence: 1.0,
candidates,
};
}

// 2. Boundary confidence: best candidate aligns with script boundary
// and has dictionary evidence on the appropriate side
if (boundaryIndex !== undefined && [...best.sei].length === boundaryIndex) {
const hasDictEvidence = boundary?.direction === "kanji-to-kana"
? (best.seiMatch === "surface" || best.seiMatch === "folded")
: (best.meiMatch === "surface" || best.meiMatch === "folded");

if (
hasDictEvidence &&
best.score >= CONFIDENCE_THRESHOLD &&
gap >= BOUNDARY_CONFIDENCE_GAP
) {
return {
best: { sei: best.sei, mei: best.mei },
confidence: BOUNDARY_CONFIDENCE,
candidates,
};
}
}

// 3. Katakana-sei exception: when the best candidate's sei is all katakana
// (e.g. ジャガー/横田), real Japanese surnames are never pure katakana.
// Re-score the boundary candidate by looking up mei in sei dict instead.
if (boundary?.direction === "kana-to-kanji" && boundaryIndex !== undefined) {
const boundaryCandidate = candidates.find(
(c) => [...c.sei].length === boundaryIndex && isAllKatakana(c.sei)
);
if (boundaryCandidate) {
const meiAsSei = lookupMatch(
boundaryCandidate.mei, "sei", lexicon, false,
options?.readingData ?? defaultReading,
);
if (meiAsSei === "surface" || meiAsSei === "folded") {
return {
best: { sei: boundaryCandidate.sei, mei: boundaryCandidate.mei },
confidence: BOUNDARY_CONFIDENCE,
candidates,
};
}
}
}

// 4. Low confidence mode
if (options?.allowLowConfidence) {
return {
best: { sei: best.sei, mei: best.mei },
confidence: confident ? 1.0 : best.score / CONFIDENCE_THRESHOLD,
confidence: best.score / CONFIDENCE_THRESHOLD,
candidates,
};
}

// Not confident enough: return unsplit
// 5. Not confident enough: return unsplit
return {
best: { sei: trimmed, mei: "" },
confidence: 0,
Expand Down
Loading
Loading