Skip to content

Commit b261044

Browse files
committed
Merge branch 'test' of gh:lcnetdev/scriptshifter into test
2 parents 838c99d + 32ee454 commit b261044

File tree

7 files changed

+59
-58
lines changed

7 files changed

+59
-58
lines changed

doc/hooks.md

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,15 @@ after the hook function is executed. Possible return values are defined below
114114
for each hook. Some special return values, such as `BREAK` and `CONT`, are
115115
registered as constants under `scriptshifter.exceptions`.
116116

117+
### Note on running multiple functions on a hook
118+
119+
Currently, if multiple functions are defined for a hook, they are executed
120+
in the order specified in the configuration. There is no way to skip a function
121+
implicitly based on the outcome of the previous one. The only state that is
122+
passed around in this context is the `ctx` instance of the `Transliterator`
123+
class. This may change in the future as specific needs arise.
124+
125+
117126
### Always available context members
118127

119128
The following members of the context object are available in all the hooks:
@@ -191,7 +200,7 @@ ignore term and when or when not to trigger a match.
191200
at every character iteration. See "Cursor Flags" below.
192201
- `ctx.dest_ls`: destination token list.
193202

194-
#### Output
203+
#### Return
195204

196205
`CONT`, `BREAK`, or `None`. `CONT` skips the checks on the
197206
current ignore token. `BREAK` stops looking up ignore tokens for the current
@@ -217,7 +226,7 @@ scanning for more ignore tokens past the match.
217226
- `ctx.ignoring`: whether an ignore token matched. If set to `False`, the rest
218227
of the workflow will assume a non-match.
219228

220-
#### Output
229+
#### Return
221230

222231
`CONT`, `BREAK`, or `None`. `CONT` voids the match and keeps
223232
on looking up the ignore list. `BREAK` stops looking up ignore tokens for the
@@ -242,7 +251,7 @@ number of characters, and/or exit the text scanning loop altogether.
242251
- `ctx.src_tk`: the input token being looked up.
243252
- `ctx.dest_tk`: the transliterated string associated with the current token.
244253

245-
#### Output
254+
#### Return
246255

247256
`CONT`, `BREAK`, or `None`. `CONT` skips the checks on the
248257
current token. `BREAK` stops looking up all tokens for the current
@@ -269,7 +278,7 @@ also inject additional conditions and logic for the match, and revoke the
269278
- `ctx.match`: whether there was a match. If set to `False`, the rest of the
270279
workflow will assume a non-match.
271280

272-
#### Output
281+
#### Return
273282

274283
`CONT`, `BREAK`, or `None`. `CONT` voids the match and keeps
275284
on looking up the token list. `BREAK` stops looking up tokens for the
@@ -292,7 +301,7 @@ cursor position to the destination list, verbatim.
292301
at every character iteration. See "Cursor Flags" below.
293302
- `ctx.dest_ls`: destination token list.
294303

295-
#### Output
304+
#### Return
296305

297306
`CONT`, `BREAK`, or `None`. `CONT` skips to the next position in the input
298307
text. In this case, the function **must** advance the cursor. `BREAK` stops all
@@ -311,10 +320,10 @@ bypass any further output handling.
311320

312321
- `ctx.dest_ls`: destination token list.
313322

314-
#### Output
323+
#### Return
315324

316-
A string or `None`. If the output is a string, the transliteration function
317-
returns this string immediately; otherwise it proceeds with standard
325+
`BREAK` or `None`. If `BREAK`, the content of `ctx.dest`, which should be set
326+
by the function, is returned immediately; otherwise it proceeds with standard
318327
adjustments and assembly of the output list.
319328

320329
### `post_assembly`
@@ -333,9 +342,9 @@ and return it before any further default processing is done.
333342

334343
#### Return
335344

336-
String or `None`. If a string, the transliteration function returns that
337-
immediately; otherwise it proceeds with standard adjustments of the output
338-
string before returning.
345+
`BREAK` or `None`. If `BREAK`, the transliteration function returns the content
346+
of `ctx.dest` immediately; otherwise it proceeds with standard adjustments of
347+
the output string before returning.
339348

340349
## Cursor flags
341350

scriptshifter/hooks/chinese/__init__.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
from logging import getLogger
55
from re import I, compile, search, sub
66

7-
from scriptshifter.hooks.general import normalize_spacing_post_assembly
8-
97

108
logger = getLogger(__name__)
119

@@ -21,7 +19,7 @@ def parse_numerals_pre_assembly(ctx):
2119
tk_ct = len(ctx.dest_ls)
2220
token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
2321

24-
output = ""
22+
output = []
2523

2624
# Use manual loop as i is manipulated inside it.
2725
i = 0
@@ -36,7 +34,7 @@ def parse_numerals_pre_assembly(ctx):
3634
# characters representing numbers are converted to Arabic
3735
# numerals. When a non-numerical token (or end of string) is
3836
# encountered, the string of numerical tokens is evaluated to
39-
# determine which version should be used in the output string.
37+
# determine which version should be used in the output.
4038
# The outer loop then continues where the inner loop left off.
4139
logger.debug(f"Match number: {tk_i}.")
4240
text_v = num_v = ""
@@ -96,7 +94,7 @@ def parse_numerals_pre_assembly(ctx):
9694
while search("[0-9] [0-9]", num_v):
9795
num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
9896

99-
output += num_v if use_num_v else text_v
97+
output.append(num_v if use_num_v else text_v)
10098

10199
# if the end of the string is not reached, backtrack to the
102100
# delimiter after the last numerical token (i.e. two tokens
@@ -117,16 +115,12 @@ def parse_numerals_pre_assembly(ctx):
117115

118116
else:
119117
logger.debug(f"No numeric match: adding {tk_i}.")
120-
output += tk_i
118+
output.append(tk_i)
121119

122120
i += 1
123121

124122
logger.debug(f"Use num version: {use_num_v}")
125-
ctx.dest = output
126-
127-
# Skip main transliterate function joining.
128-
129-
return normalize_spacing_post_assembly(ctx)
123+
ctx.dest_ls = output
130124

131125

132126
def person_name_pre_assembly(ctx):

scriptshifter/hooks/general/__init__.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
from logging import getLogger
66
from re import compile
77

8-
from scriptshifter.trans import MULTI_WS_RE
98

9+
# Match multiple spaces.
10+
MULTI_WS_RE = compile(r"(\s){2,}")
1011

1112
# Punctuation and brackets.
1213
# TODO add angled brackets, opening and closing quotes, etc.
1314
NORM1_RE = compile(r"\s([.,;:\)\]}])")
14-
NORM2_RE = compile(r"([.,;:\)\]}])(\S)")
15+
NORM2_RE = compile(r"([,;\)\]}])(\S)")
1516
NORM3_RE = compile(r"([\(\[\{])\s")
1617
NORM4_RE = compile(r"(\S)([\(\[\{])")
1718

@@ -42,12 +43,15 @@ def capitalize_post_assembly(ctx):
4243

4344
dest_ls = _capitalize(dest_ls, ctx.options.get("capitalize"))
4445

45-
return " ".join(dest_ls)
46+
ctx.dest = " ".join(dest_ls)
4647

4748

4849
def normalize_spacing_post_assembly(ctx):
4950
"""
5051
Remove duplicate and unwanted whitespace around punctuation.
52+
53+
NOTE: This is called by default by transliterate() immediately after the
54+
`post_assembly` hook.
5155
"""
5256
# De-duplicate whitespace.
5357
logger.debug(f"Dest pre manipulation: {ctx.dest}")
@@ -70,7 +74,7 @@ def normalize_spacing_post_assembly(ctx):
7074
# Remove multiple white space characters.
7175
# norm = NORM8_RE.sub(r"\1\2", norm)
7276

73-
return norm
77+
ctx.dest = norm
7478

7579

7680
def _capitalize(src, which):

scriptshifter/hooks/hebrew/dicta_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,6 @@ def s2r_post_config(ctx):
2727

2828
ctx.dest = rsp.json().get("transliteration")
2929
if ctx.dest:
30-
ctx.dest = capitalize_post_assembly(ctx)
30+
capitalize_post_assembly(ctx)
3131

3232
return BREAK

scriptshifter/hooks/korean/romanizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def s2r_nonames_post_config(ctx):
6666
# FKR042: Capitalize all first letters
6767
# FKR043: Capitalize the first letter
6868
logger.debug(f"Before capitalization: {ctx.dest}")
69-
ctx.dest = capitalize_post_assembly(ctx)
69+
capitalize_post_assembly(ctx)
7070

7171
return BREAK
7272

@@ -84,7 +84,7 @@ def s2r_names_post_config(ctx):
8484
# FKR042: Capitalize all first letters
8585
# FKR043: Capitalize the first letter
8686
logger.debug(f"Before capitalization: {ctx.dest}")
87-
ctx.dest = capitalize_post_assembly(ctx)
87+
capitalize_post_assembly(ctx)
8888

8989
return BREAK
9090

scriptshifter/tables/data/uighur_arabic.yml

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ roman_to_script:
66
map:
77
"%a": "\u0626\u0627"
88
"a": "\u0627"
9-
"%ă": "\u0626\u06D5"
10-
"ă": "\u06D5"
9+
"%": "\u0626\u06D5"
10+
"": "\u06D5"
1111
"b": "\u0628"
1212
"ch": "\u0686"
1313
"d": "\u062F"
@@ -28,8 +28,8 @@ roman_to_script:
2828
"ng": "\u06AD"
2929
"%o": "\u0626\u0648"
3030
"o": "\u0648"
31-
"%ö": "\u0626\u06C6"
32-
"ö": "\u06C6"
31+
"%": "\u0626\u06C6"
32+
"": "\u06C6"
3333
"p": "\u067E"
3434
"q": "\u0642"
3535
"r": "\u0631"
@@ -38,8 +38,8 @@ roman_to_script:
3838
"t": "\u062A"
3939
"%u": "\u0626\u06C7"
4040
"u": "\u06C7"
41-
"%ü": "\u0626\u06C8"
42-
"ü": "\u06C8"
41+
"%": "\u0626\u06C8"
42+
"": "\u06C8"
4343
"v": "\u06CB"
4444
"y": "\u064A"
4545
"z": "\u0632"
@@ -53,11 +53,11 @@ script_to_roman:
5353
"%\u0626\u0627": "a"
5454
"\u0627": "a"
5555
"\uFE8E": "a"
56-
"%\u0626\u06D5": "ă"
57-
"\u06D5": "ă"
58-
"%\u0626\u0647": "ă"
59-
"\u0647": "ă"
60-
"\uFEEA": "ă"
56+
"%\u0626\u06D5": ""
57+
"\u06D5": ""
58+
"%\u0626\u0647": ""
59+
"\u0647": ""
60+
"\uFEEA": ""
6161
"\u0628": "b"
6262
"\uFE91": "b"
6363
"\uFE92": "b"
@@ -139,9 +139,9 @@ script_to_roman:
139139
"%\u0626\u0648": "o"
140140
"\u0648": "o"
141141
"\uFEEE": "o"
142-
"%\u0626\u06C6": "ö"
143-
"\u06C6": "ö"
144-
"\uFBDA": "ö"
142+
"%\u0626\u06C6": ""
143+
"\u06C6": ""
144+
"\uFBDA": ""
145145
"\u067E": "p"
146146
"\uFB58": "p"
147147
"\uFB59": "p"
@@ -173,9 +173,9 @@ script_to_roman:
173173
"%\u0626\u06C7": "u"
174174
"\u06C7": "u"
175175
"\uFBF0": "u"
176-
"%\u0626\u06C8": "ü"
177-
"\u06C8": "ü"
178-
"\uFBF4": "ü"
176+
"%\u0626\u06C8": ""
177+
"\u06C8": ""
178+
"\uFBF4": ""
179179
"\u06CB": "v"
180180
"\uFBDF": "v"
181181
"\u064A": "y"

scriptshifter/trans.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,12 @@
55
from unicodedata import normalize as precomp_normalize
66

77
from scriptshifter.exceptions import BREAK, CONT
8+
from scriptshifter.hooks.general import normalize_spacing_post_assembly
89
from scriptshifter.tables import (
910
BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
1011
get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
1112
get_lang_ignore, get_lang_map, get_lang_normalize)
1213

13-
14-
# Match multiple spaces.
15-
MULTI_WS_RE = compile(r"(\s){2,}")
16-
1714
logger = logging.getLogger(__name__)
1815

1916

@@ -389,20 +386,17 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
389386

390387
# This hook may take care of the assembly and cause the function to
391388
# return `ctx.dest` immediately by returning `BREAK`.
392-
hret = ctx.run_hook("pre_assembly")
393-
if hret is not None:
394-
return hret, ctx.warnings
389+
if ctx.run_hook("pre_assembly") == BREAK:
390+
return ctx.dest, ctx.warnings
395391

396392
logger.debug(f"Output list: {ctx.dest_ls}")
397393
ctx.dest = "".join(ctx.dest_ls)
398394

399395
# This hook may reassign the output string and/or cause the function to
400396
# return it immediately.
401-
hret = ctx.run_hook("post_assembly")
402-
if hret is not None:
403-
return hret, ctx.warnings
397+
if ctx.run_hook("post_assembly") == BREAK:
398+
return ctx.dest, ctx.warnings
404399

405-
# Strip multiple spaces and leading/trailing whitespace.
406-
ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
400+
normalize_spacing_post_assembly(ctx)
407401

408402
return ctx.dest, ctx.warnings

0 commit comments

Comments
 (0)