|
25 | 25 |
|
26 | 26 | from csv import reader |
27 | 27 | from os import path |
| 28 | +from unicodedata import normalize |
28 | 29 |
|
29 | 30 | from scriptshifter.exceptions import BREAK |
30 | 31 | from scriptshifter.hooks.korean import KCONF |
@@ -92,6 +93,8 @@ def s2r_names_post_config(ctx): |
92 | 93 | def _romanize_nonames(src, options): |
93 | 94 | """ Main Romanization function for non-name strings. """ |
94 | 95 |
|
| 96 | + # Normalize to precomposed characters. |
| 97 | + src = normalize("NFC", src) |
95 | 98 | # FKR038: Convert Chinese characters to Hangul |
96 | 99 | if options.get("hancha", True): |
97 | 100 | kor = _hancha2hangul(_marc8_hancha(src)) |
@@ -142,6 +145,8 @@ def _romanize_names(src, options): |
142 | 145 | """ |
143 | 146 | rom_ls = [] |
144 | 147 | warnings = [] |
| 148 | + # Normalize to precomposed characters. |
| 149 | + src = normalize("NFC", src) |
145 | 150 |
|
146 | 151 | if "," in src and "·" in src: |
147 | 152 | warnings.append( |
@@ -386,9 +391,10 @@ def _romanize_oclc_auto(kor): |
386 | 391 |
|
387 | 392 | # FKR068: Exceptions, Exceptions to initial sound law, Proper names |
388 | 393 | def _kor_rom(kor): |
389 | | - # Only convert string if it contains CJK (i.e. do not change if already romanized) |
390 | | - # \u3000 is the ideographic space, the lowest codepoint in the Unicode CJK range |
391 | | - if max(kor) < '\u3000': |
| 394 | + # Only convert string if it contains CJK (i.e. do not change if already |
| 395 | + # romanized). \u3000 is the ideographic space, the lowest codepoint in the |
| 396 | + # Unicode CJK range. |
| 397 | + if max(kor) < '\u3000': |
392 | 398 | return kor |
393 | 399 |
|
394 | 400 | kor = re.sub(r"\s{2,}", " ", kor.strip()) |
|
0 commit comments