lcnetdev · scossu · Oct 24, 2025 · Sep 5, 2025 · Sep 22, 2025 · Sep 22, 2025
diff --git a/.github/workflows/push-app-image.yml b/.github/workflows/push-app-image.yml
@@ -24,10 +24,10 @@ jobs:
         uses: actions/checkout@v4
         with:
           submodules: recursive
+          fetch-tags: true
 
       - name: update version info
         run: |
-          git fetch --tags
           git describe --tags --always >| VERSION
           git rev-parse HEAD >> VERSION
 

diff --git a/.github/workflows/push-test-image.yml b/.github/workflows/push-test-image.yml
@@ -17,10 +17,10 @@ jobs:
         uses: actions/checkout@v4
         with:
           submodules: recursive
+          fetch-tags: true
 
       - name: update version info
         run: |
-          git fetch --tags
           git describe --tags --always >| VERSION
           git rev-parse HEAD >> VERSION
 

diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,4 @@
 [submodule "ext/arabic_rom"]
 	path = ext/arabic_rom
 	url = https://github.com/fadhleryani/Arabic_ALA-LC_Romanization.git
+	branch = main
diff --git a/ext/arabic_rom b/ext/arabic_rom
diff --git a/scriptshifter/hooks/korean/romanizer.py b/scriptshifter/hooks/korean/romanizer.py
@@ -25,6 +25,7 @@
 
 from csv import reader
 from os import path
+from unicodedata import normalize
 
 from scriptshifter.exceptions import BREAK
 from scriptshifter.hooks.korean import KCONF
@@ -92,6 +93,8 @@ def s2r_names_post_config(ctx):
 def _romanize_nonames(src, options):
     """ Main Romanization function for non-name strings. """
 
+    # Normalize to precomposed characters.
+    src = normalize("NFC", src)
     # FKR038: Convert Chinese characters to Hangul
     if options.get("hancha", True):
         kor = _hancha2hangul(_marc8_hancha(src))
@@ -142,6 +145,8 @@ def _romanize_names(src, options):
     """
     rom_ls = []
     warnings = []
+    # Normalize to precomposed characters.
+    src = normalize("NFC", src)
 
     if "," in src and "·" in src:
         warnings.append(
@@ -386,9 +391,10 @@ def _romanize_oclc_auto(kor):
 
 # FKR068: Exceptions, Exceptions to initial sound law, Proper names
 def _kor_rom(kor):
-    # Only convert string if it contains CJK (i.e. do not change if already romanized)
-    # \u3000 is the ideographic space, the lowest codepoint in the Unicode CJK range
-    if max(kor) < '\u3000': 
+    # Only convert string if it contains CJK (i.e. do not change if already
+    # romanized). \u3000 is the ideographic space, the lowest codepoint in the
+    # Unicode CJK range.
+    if max(kor) < '\u3000':
         return kor
 
     kor = re.sub(r"\s{2,}", " ", kor.strip())

diff --git a/scriptshifter/tables/data/chinese.yml b/scriptshifter/tables/data/chinese.yml
@@ -1,11 +1,9 @@
-# Chinese numerals map.
-#
-# All other Chinese mappings are kept in _chinese_base.yml. This mapping only
-# adds an overlay for parsing numerals and Scriptshifter-specific features.
-
 ---
 general:
   name: Chinese
+  description: >
+    Chinese transliteration table that does not convert Chinese numerals to
+    Indo-Arabic numerals.
   parents:
     - _chinese_base
   case_sensitive: false
@@ -29,39 +27,7 @@ script_to_roman:
 
   hooks:
     pre_assembly:
-      -
-        - chinese.parse_numerals_pre_assembly
       -
         - chinese.person_name_pre_assembly
 
-  map:
-    "〇": "ling#0 "
-    "零": "ling#0 "
-    "一": "yi#1 "
-    "二": "er#2 "
-    "兩": "liang#2 "
-    "两": "liang#2 "
-    "三": "san#3 "
-    "四": "si#4 "
-    "五": "wu#5 "
-    "六": "liu#6 "
-    "七": "qi#7 "
-    "八": "ba#8 "
-    "九": "jiu#9 "
-    "十": "shi#10 "
-    "廾": "gong#20 "
-    "廿": "nian#20 "
-    "卅": "sa#30 "
-    "卌": "xi#40 "
-    "百": "bai#100 "
-    "千": "qian#1000 "
-    "万": "wan#10000 "
-    "萬": "wan#10000 "
-    "亿": "yi#100000000 "
-    "億": "yi#100000000 "
-    "及": "ji# "
-    "至": "zhi# "
-    "年": "nian# "
-    "月": "yue# "
-    "日": "ri# "
-    "第": "di# "
+  map: {}
diff --git a/scriptshifter/tables/data/chinese_numerals.yml b/scriptshifter/tables/data/chinese_numerals.yml
@@ -0,0 +1,70 @@
+# Chinese numerals map.
+#
+# All other Chinese mappings are kept in _chinese_base.yml. This mapping only
+# adds an overlay for parsing numerals and Scriptshifter-specific features.
+
+---
+general:
+  name: Chinese (numerals transliteration)
+  description: >
+    Chinese transliteration table that includes romanization of Chinese
+    numerals.
+  parents:
+    - _chinese_base
+  case_sensitive: false
+
+options:
+  - id: marc_field
+    label: MARC field
+    description: >
+      Romanize according to a specific MARC field format. If indicating a
+      subfield, append it to the numeric field value, e.g. '245n'.
+      Leave blank if not applicable.
+    type: string
+    default:
+
+script_to_roman:
+  directives:
+    # Capitalize the first letter of the string only.
+    # TODO: Implement a list of all punctuation marks after which the
+    # following letter should be capitalized.
+    capitalize: true
+
+  hooks:
+    pre_assembly:
+      -
+        - chinese.parse_numerals_pre_assembly
+      -
+        - chinese.person_name_pre_assembly
+
+  map:
+    "〇": "ling#0 "
+    "零": "ling#0 "
+    "一": "yi#1 "
+    "二": "er#2 "
+    "兩": "liang#2 "
+    "两": "liang#2 "
+    "三": "san#3 "
+    "四": "si#4 "
+    "五": "wu#5 "
+    "六": "liu#6 "
+    "七": "qi#7 "
+    "八": "ba#8 "
+    "九": "jiu#9 "
+    "十": "shi#10 "
+    "廾": "gong#20 "
+    "廿": "nian#20 "
+    "卅": "sa#30 "
+    "卌": "xi#40 "
+    "百": "bai#100 "
+    "千": "qian#1000 "
+    "万": "wan#10000 "
+    "萬": "wan#10000 "
+    "亿": "yi#100000000 "
+    "億": "yi#100000000 "
+    "及": "ji# "
+    "至": "zhi# "
+    "年": "nian# "
+    "月": "yue# "
+    "日": "ri# "
+    "第": "di# "