Merge branch 'test' of gh:lcnetdev/scriptshifter into test

scossu · scossu · commit b261044a8d9a · 2025-06-03T08:54:51.000-04:00
diff --git a/doc/hooks.md b/doc/hooks.md
@@ -114,6 +114,15 @@ after the hook function is executed. Possible return values are defined below
 for each hook. Some special return values, such as `BREAK` and `CONT`, are
 registered as constants under `scriptshifter.exceptions`.
 
+### Note on running multiple functions on a hook
+
+Currently, if multiple functions are defined for a hook, they are executed
+in the order specified in the configuration. There is no way to skip a function
+implicitly based on the outcome of the previous one. The only state that is
+passed around in this context is the `ctx` instance of the `Transliterator`
+class. This may change in the future as specific needs arise.
+
+
 ### Always available context members
 
 The following members of the context object are available in all the hooks:
@@ -191,7 +200,7 @@ ignore term and when or when not to trigger a match.
   at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` skips the checks on the
 current ignore token. `BREAK` stops looking up ignore tokens for the current
@@ -217,7 +226,7 @@ scanning for more ignore tokens past the match.
 - `ctx.ignoring`: whether an ignore token matched. If set to `False`, the rest
   of the workflow will assume a non-match.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` voids the match and keeps
 on looking up the ignore list. `BREAK` stops looking up ignore tokens for the
@@ -242,7 +251,7 @@ number of characters, and/or exit the text scanning loop altogether.
 - `ctx.src_tk`: the input token being looked up.
 - `ctx.dest_tk`: the transliterated string associated with the current token.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` skips the checks on the
 current token. `BREAK` stops looking up all tokens for the current
@@ -269,7 +278,7 @@ also inject additional conditions and logic for the match, and revoke the
 - `ctx.match`: whether there was a match. If set to `False`, the rest of the
   workflow will assume a non-match.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` voids the match and keeps
 on looking up the token list. `BREAK` stops looking up tokens for the
@@ -292,7 +301,7 @@ cursor position to the destination list, verbatim.
   at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` skips to the next position in the input
 text. In this case, the function **must** advance the cursor. `BREAK` stops all
@@ -311,10 +320,10 @@ bypass any further output handling.
 
 - `ctx.dest_ls`: destination token list.
 
-#### Output
+#### Return
 
-A string or `None`. If the output is a string, the transliteration function
-returns this string immediately; otherwise it proceeds with standard
+`BREAK` or `None`. If `BREAK`, the content of `ctx.dest`, which should be set
+by the function, is returned immediately; otherwise it proceeds with standard
 adjustments and assembly of the output list.
 
 ### `post_assembly`
@@ -333,9 +342,9 @@ and return it before any further default processing is done.
 
 #### Return
 
-String or `None`. If a string, the transliteration function returns that
-immediately; otherwise it proceeds with standard adjustments of the output
-string before returning.
+`BREAK` or `None`. If `BREAK`, the transliteration function returns the content
+of `ctx.dest` immediately; otherwise it proceeds with standard adjustments of
+the output string before returning.
 
 ## Cursor flags
 
diff --git a/scriptshifter/hooks/chinese/__init__.py b/scriptshifter/hooks/chinese/__init__.py
@@ -4,8 +4,6 @@
 from logging import getLogger
 from re import I, compile, search, sub
 
-from scriptshifter.hooks.general import normalize_spacing_post_assembly
-
 
 logger = getLogger(__name__)
 
@@ -21,7 +19,7 @@ def parse_numerals_pre_assembly(ctx):
     tk_ct = len(ctx.dest_ls)
     token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
 
-    output = ""
+    output = []
 
     # Use manual loop as i is manipulated inside it.
     i = 0
@@ -36,7 +34,7 @@ def parse_numerals_pre_assembly(ctx):
             # characters representing numbers are converted to Arabic
             # numerals. When a non-numerical token (or end of string) is
             # encountered, the string of numerical tokens is evaluated to
-            # determine which version should be used in the output string.
+            # determine which version should be used in the output.
             # The outer loop then continues where the inner loop left off.
             logger.debug(f"Match number: {tk_i}.")
             text_v = num_v = ""
@@ -96,7 +94,7 @@ def parse_numerals_pre_assembly(ctx):
                             while search("[0-9] [0-9]", num_v):
                                 num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
 
-                    output += num_v if use_num_v else text_v
+                    output.append(num_v if use_num_v else text_v)
 
                     # if the end of the string is not reached, backtrack to the
                     # delimiter after the last numerical token (i.e. two tokens
@@ -117,16 +115,12 @@ def parse_numerals_pre_assembly(ctx):
 
         else:
             logger.debug(f"No numeric match: adding {tk_i}.")
-            output += tk_i
+            output.append(tk_i)
 
         i += 1
 
     logger.debug(f"Use num version: {use_num_v}")
-    ctx.dest = output
-
-    # Skip main transliterate function joining.
-
-    return normalize_spacing_post_assembly(ctx)
+    ctx.dest_ls = output
 
 
 def person_name_pre_assembly(ctx):
diff --git a/scriptshifter/hooks/general/__init__.py b/scriptshifter/hooks/general/__init__.py
@@ -5,13 +5,14 @@
 from logging import getLogger
 from re import compile
 
-from scriptshifter.trans import MULTI_WS_RE
 
+# Match multiple spaces.
+MULTI_WS_RE = compile(r"(\s){2,}")
 
 # Punctuation and brackets.
 # TODO add angled brackets, opening and closing quotes, etc.
 NORM1_RE = compile(r"\s([.,;:\)\]}])")
-NORM2_RE = compile(r"([.,;:\)\]}])(\S)")
+NORM2_RE = compile(r"([,;\)\]}])(\S)")
 NORM3_RE = compile(r"([\(\[\{])\s")
 NORM4_RE = compile(r"(\S)([\(\[\{])")
 
@@ -42,12 +43,15 @@ def capitalize_post_assembly(ctx):
 
     dest_ls = _capitalize(dest_ls, ctx.options.get("capitalize"))
 
-    return " ".join(dest_ls)
+    ctx.dest = " ".join(dest_ls)
 
 
 def normalize_spacing_post_assembly(ctx):
     """
     Remove duplicate and unwanted whitespace around punctuation.
+
+    NOTE: This is called by default by transliterate() immediately after the
+    `post_assembly` hook.
     """
     # De-duplicate whitespace.
     logger.debug(f"Dest pre manipulation: {ctx.dest}")
@@ -70,7 +74,7 @@ def normalize_spacing_post_assembly(ctx):
     # Remove multiple white space characters.
     # norm = NORM8_RE.sub(r"\1\2", norm)
 
-    return norm
+    ctx.dest = norm
 
 
 def _capitalize(src, which):
diff --git a/scriptshifter/hooks/hebrew/dicta_api.py b/scriptshifter/hooks/hebrew/dicta_api.py
@@ -27,6 +27,6 @@ def s2r_post_config(ctx):
 
     ctx.dest = rsp.json().get("transliteration")
     if ctx.dest:
-        ctx.dest = capitalize_post_assembly(ctx)
+        capitalize_post_assembly(ctx)
 
     return BREAK
diff --git a/scriptshifter/hooks/korean/romanizer.py b/scriptshifter/hooks/korean/romanizer.py
@@ -66,7 +66,7 @@ def s2r_nonames_post_config(ctx):
         # FKR042: Capitalize all first letters
         # FKR043: Capitalize the first letter
         logger.debug(f"Before capitalization: {ctx.dest}")
-        ctx.dest = capitalize_post_assembly(ctx)
+        capitalize_post_assembly(ctx)
 
     return BREAK
 
@@ -84,7 +84,7 @@ def s2r_names_post_config(ctx):
         # FKR042: Capitalize all first letters
         # FKR043: Capitalize the first letter
         logger.debug(f"Before capitalization: {ctx.dest}")
-        ctx.dest = capitalize_post_assembly(ctx)
+        capitalize_post_assembly(ctx)
 
     return BREAK
 
diff --git a/scriptshifter/tables/data/uighur_arabic.yml b/scriptshifter/tables/data/uighur_arabic.yml
@@ -6,8 +6,8 @@ roman_to_script:
   map:
     "%a": "\u0626\u0627"
     "a": "\u0627"
-    "%ă": "\u0626\u06D5"
-    "ă": "\u06D5"
+    "%ă": "\u0626\u06D5"
+    "ă": "\u06D5"
     "b": "\u0628"
     "ch": "\u0686"
     "d": "\u062F"
@@ -28,8 +28,8 @@ roman_to_script:
     "ng": "\u06AD"
     "%o": "\u0626\u0648"
     "o": "\u0648"
-    "%ö": "\u0626\u06C6"
-    "ö": "\u06C6"
+    "%ö": "\u0626\u06C6"
+    "ö": "\u06C6"
     "p": "\u067E"
     "q": "\u0642"
     "r": "\u0631"
@@ -38,8 +38,8 @@ roman_to_script:
     "t": "\u062A"
     "%u": "\u0626\u06C7"
     "u": "\u06C7"
-    "%ü": "\u0626\u06C8"
-    "ü": "\u06C8"
+    "%ü": "\u0626\u06C8"
+    "ü": "\u06C8"
     "v": "\u06CB"
     "y": "\u064A"
     "z": "\u0632"
@@ -53,11 +53,11 @@ script_to_roman:
     "%\u0626\u0627": "a"
     "\u0627": "a"
     "\uFE8E": "a"
-    "%\u0626\u06D5": "ă"
-    "\u06D5": "ă"
-    "%\u0626\u0647": "ă"
-    "\u0647": "ă"
-    "\uFEEA": "ă"
+    "%\u0626\u06D5": "ă"
+    "\u06D5": "ă"
+    "%\u0626\u0647": "ă"
+    "\u0647": "ă"
+    "\uFEEA": "ă"
     "\u0628": "b"
     "\uFE91": "b"
     "\uFE92": "b"
@@ -139,9 +139,9 @@ script_to_roman:
     "%\u0626\u0648": "o"
     "\u0648": "o"
     "\uFEEE": "o"
-    "%\u0626\u06C6": "ö"
-    "\u06C6": "ö"
-    "\uFBDA": "ö"
+    "%\u0626\u06C6": "ö"
+    "\u06C6": "ö"
+    "\uFBDA": "ö"
     "\u067E": "p"
     "\uFB58": "p"
     "\uFB59": "p"
@@ -173,9 +173,9 @@ script_to_roman:
     "%\u0626\u06C7": "u"
     "\u06C7": "u"
     "\uFBF0": "u"
-    "%\u0626\u06C8": "ü"
-    "\u06C8": "ü"
-    "\uFBF4": "ü"
+    "%\u0626\u06C8": "ü"
+    "\u06C8": "ü"
+    "\uFBF4": "ü"
     "\u06CB": "v"
     "\uFBDF": "v"
     "\u064A": "y"
diff --git a/scriptshifter/trans.py b/scriptshifter/trans.py
@@ -5,15 +5,12 @@
 from unicodedata import normalize as precomp_normalize
 
 from scriptshifter.exceptions import BREAK, CONT
+from scriptshifter.hooks.general import normalize_spacing_post_assembly
 from scriptshifter.tables import (
         BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
         get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
         get_lang_ignore, get_lang_map, get_lang_normalize)
 
-
-# Match multiple spaces.
-MULTI_WS_RE = compile(r"(\s){2,}")
-
 logger = logging.getLogger(__name__)
 
 
@@ -389,20 +386,17 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
        # This hook may take care of the assembly itself; if it returns BREAK,
        # the function returns the content of `ctx.dest` immediately.
-        hret = ctx.run_hook("pre_assembly")
-        if hret is not None:
-            return hret, ctx.warnings
+        if ctx.run_hook("pre_assembly") == BREAK:
+            return ctx.dest, ctx.warnings
 
         logger.debug(f"Output list: {ctx.dest_ls}")
         ctx.dest = "".join(ctx.dest_ls)
 
         # This hook may reassign the output string and/or cause the function to
         # return it immediately.
-        hret = ctx.run_hook("post_assembly")
-        if hret is not None:
-            return hret, ctx.warnings
+        if ctx.run_hook("post_assembly") == BREAK:
+            return ctx.dest, ctx.warnings
 
-        # Strip multiple spaces and leading/trailing whitespace.
-        ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
+        normalize_spacing_post_assembly(ctx)
 
         return ctx.dest, ctx.warnings