From 71ce4654f03aa4b0eb23b084c336a88192bb53e4 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Fri, 20 Jun 2025 23:18:15 -0400 Subject: [PATCH] Fix spacing around end markers --- .../place_markers_usfm_update_block_handler.py | 14 +++++++++++++- ...test_place_markers_usfm_update_block_handler.py | 8 +++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index cdb59142..72e8c827 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -35,6 +35,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ( e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE] and not e.marked_for_removal + and len(e.tokens) == 1 ) for e in elements ) @@ -114,7 +115,18 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: adj_trg_tok = self._predict_marker_location( self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks ) - trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent) + + if ( + adj_trg_tok > 0 + and element.type == UsfmUpdateBlockElementType.STYLE + and element.tokens[0].marker[-1] == "*" + ): + # Insert end tokens directly after the token they follow + trg_str_idx = trg_tok_starts[adj_trg_tok - 1] + len(trg_toks[adj_trg_tok - 1]) + elif adj_trg_tok < len(trg_tok_starts): + trg_str_idx = trg_tok_starts[adj_trg_tok] + else: + trg_str_idx = len(trg_sent) to_insert.append((trg_str_idx, element)) to_insert.sort(key=lambda x: x[0]) diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index 641445b3..27c97461 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -80,10 +80,8 @@ def test_style_markers() -> None: ) result = r"""\id MAT \c 1 -\v 1 Esta es la \w primera \w*oración. Este texto está en \w inglés \w*y esta prueba es \w para \w*marcadores de estilo. +\v 1 Esta es la \w primera\w* oración. Este texto está en \w inglés\w* y esta prueba es \w para\w* marcadores de estilo. """ - # NOTE: the spacing before/after end markers is incorrect, - # but this is an issue with how the is USFM is generated from the tokens assess(target, result) target = update_usfm( @@ -269,7 +267,7 @@ def test_consecutive_markers() -> None: usfm = r"""\id MAT \c 1 \v 1 Old verse 1 -\p \qt \+w word \+w* \qt* +\p \qt \+w word\+w*\qt* """ align_info = [ @@ -290,7 +288,7 @@ def test_consecutive_markers() -> None: result = r"""\id MAT \c 1 \v 1 New verse 1 -\p \qt \+w WORD \+w*\qt* +\p \qt \+w WORD\+w*\qt* """ assess(target, result)