Merge pull request #261 from lcnetdev/korean_decompose

scossu · web-flow · commit a9042971b84a · 2025-10-23T22:01:15.000-04:00
Korean decompose
diff --git a/scriptshifter/hooks/korean/romanizer.py b/scriptshifter/hooks/korean/romanizer.py
@@ -25,6 +25,7 @@
 
 from csv import reader
 from os import path
+from unicodedata import normalize
 
 from scriptshifter.exceptions import BREAK
 from scriptshifter.hooks.korean import KCONF
@@ -92,6 +93,8 @@ def s2r_names_post_config(ctx):
 def _romanize_nonames(src, options):
     """ Main Romanization function for non-name strings. """
 
+    # Normalize to precomposed characters.
+    src = normalize("NFC", src)
     # FKR038: Convert Chinese characters to Hangul
     if options.get("hancha", True):
         kor = _hancha2hangul(_marc8_hancha(src))
@@ -142,6 +145,8 @@ def _romanize_names(src, options):
     """
     rom_ls = []
     warnings = []
+    # Normalize to precomposed characters.
+    src = normalize("NFC", src)
 
     if "," in src and "·" in src:
         warnings.append(
@@ -386,9 +391,10 @@ def _romanize_oclc_auto(kor):
 
 # FKR068: Exceptions, Exceptions to initial sound law, Proper names
 def _kor_rom(kor):
-    # Only convert string if it contains CJK (i.e. do not change if already romanized)
-    # \u3000 is the ideographic space, the lowest codepoint in the Unicode CJK range
-    if max(kor) < '\u3000': 
+    # Only convert string if it contains CJK (i.e. do not change if already
+    # romanized). \u3000 is the ideographic space, the lowest codepoint in the
+    # Unicode CJK range.
+    if max(kor) < '\u3000':
         return kor
 
     kor = re.sub(r"\s{2,}", " ", kor.strip())