Skip to content

Commit a904297

Browse files
authored
Merge pull request #261 from lcnetdev/korean_decompose
Korean decompose
2 parents 3e07723 + 2cc5d7e commit a904297

File tree

1 file changed

+9
-3
lines changed

1 file changed

+9
-3
lines changed

scriptshifter/hooks/korean/romanizer.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
from csv import reader
2727
from os import path
28+
from unicodedata import normalize
2829

2930
from scriptshifter.exceptions import BREAK
3031
from scriptshifter.hooks.korean import KCONF
@@ -92,6 +93,8 @@ def s2r_names_post_config(ctx):
9293
def _romanize_nonames(src, options):
9394
""" Main Romanization function for non-name strings. """
9495

96+
# Normalize to precomposed characters.
97+
src = normalize("NFC", src)
9598
# FKR038: Convert Chinese characters to Hangul
9699
if options.get("hancha", True):
97100
kor = _hancha2hangul(_marc8_hancha(src))
@@ -142,6 +145,8 @@ def _romanize_names(src, options):
142145
"""
143146
rom_ls = []
144147
warnings = []
148+
# Normalize to precomposed characters.
149+
src = normalize("NFC", src)
145150

146151
if "," in src and "·" in src:
147152
warnings.append(
@@ -386,9 +391,10 @@ def _romanize_oclc_auto(kor):
386391

387392
# FKR068: Exceptions, Exceptions to initial sound law, Proper names
388393
def _kor_rom(kor):
389-
# Only convert string if it contains CJK (i.e. do not change if already romanized)
390-
# \u3000 is the ideographic space, the lowest codepoint in the Unicode CJK range
391-
if max(kor) < '\u3000':
394+
# Only convert string if it contains CJK (i.e. do not change if already
395+
# romanized). \u3000 is the ideographic space, the lowest codepoint in the
396+
# Unicode CJK range
397+
if max(kor) < '\u3000':
392398
return kor
393399

394400
kor = re.sub(r"\s{2,}", " ", kor.strip())

0 commit comments

Comments
 (0)