diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 314b35b2..45eb628b 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -51,7 +51,12 @@ normalize, unescape_spaces, ) -from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior +from .update_usfm_parser_handler import ( + UpdateUsfmMarkerBehavior, + UpdateUsfmParserHandler, + UpdateUsfmRow, + UpdateUsfmTextBehavior, +) from .usfm_file_text import UsfmFileText from .usfm_file_text_corpus import UsfmFileTextCorpus from .usfm_memory_text import UsfmMemoryText @@ -135,6 +140,7 @@ "UpdateUsfmMarkerBehavior", "UpdateUsfmParserHandler", "UpdateUsfmTextBehavior", + "UpdateUsfmRow", "UsfmAttribute", "UsfmElementType", "UsfmFileText", diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 97a0578a..c29d1efa 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,11 +1,15 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union +from typing import BinaryIO, Iterable, Optional, Sequence, Union from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase -from .scripture_ref import ScriptureRef -from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior +from .update_usfm_parser_handler import ( + UpdateUsfmMarkerBehavior, + UpdateUsfmParserHandler, + UpdateUsfmRow, + UpdateUsfmTextBehavior, +) from .usfm_parser import parse_usfm from .usfm_update_block_handler import UsfmUpdateBlockHandler @@ -20,7 +24,7 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti def update_usfm( self, book_id: str, - rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, + rows: Optional[Sequence[UpdateUsfmRow]] = None, full_name: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py index 72e8c827..b86992d9 100644 --- a/machine/corpora/place_markers_usfm_update_block_handler.py +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable, List, TypedDict +from typing import List, TypedDict, cast from ..translation.word_alignment_matrix import WordAlignmentMatrix from .usfm_token import UsfmToken, UsfmTokenType @@ -8,9 +8,10 @@ from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType from .usfm_update_block_handler import UsfmUpdateBlockHandler +PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info" + class PlaceMarkersAlignmentInfo(TypedDict): - refs: List[str] source_tokens: List[str] translation_tokens: List[str] alignment: WordAlignmentMatrix @@ -18,19 +19,18 @@ class PlaceMarkersAlignmentInfo(TypedDict): class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): - def __init__(self, align_info: Iterable[PlaceMarkersAlignmentInfo]) -> None: - self._align_info = {info["refs"][0]: info for info in align_info} - def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: - ref = str(block.refs[0]) elements = list(block.elements) # Nothing to do if there are no markers to place or no alignment to use + if PLACE_MARKERS_ALIGNMENT_INFO_KEY not in block.metadata: + return block + + alignment_info = cast(PlaceMarkersAlignmentInfo, block.metadata[PLACE_MARKERS_ALIGNMENT_INFO_KEY]) if ( len(elements) == 0 - or ref not in self._align_info.keys() - or self._align_info[ref]["alignment"].row_count == 0 - or self._align_info[ref]["alignment"].column_count == 0 + or alignment_info["alignment"].row_count == 0 + or alignment_info["alignment"].column_count == 0 or not any( ( e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE] @@ -65,8 +65,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ): eob_empty_paras = False - src_toks = self._align_info[ref]["source_tokens"] - trg_toks = self._align_info[ref]["translation_tokens"] + src_toks: List[str] = alignment_info["source_tokens"] + trg_toks: List[str] = alignment_info["translation_tokens"] src_tok_idx = 0 src_sent = "" @@ -112,9 +112,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: # Predict marker placements and get insertion order to_insert = [] for element, adj_src_tok in zip(to_place, adj_src_toks): - adj_trg_tok = self._predict_marker_location( - self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks - ) + adj_trg_tok = self._predict_marker_location(alignment_info["alignment"], adj_src_tok, src_toks, trg_toks) if ( adj_trg_tok > 0 diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index e9603480..a51021b2 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -24,10 +24,17 @@ class UpdateUsfmMarkerBehavior(Enum): STRIP = auto() +class UpdateUsfmRow: + def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[dict[str, object]] = None): + self.refs = refs + self.text = text + self.metadata = metadata + + class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler): def __init__( self, - rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, + rows: Optional[Sequence[UpdateUsfmRow]] = None, id_text: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, @@ -284,12 +291,14 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: tokenizer = UsfmTokenizer(stylesheet) return tokenizer.detokenize(self._tokens) - def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]: + def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: row_texts: List[str] = [] + row_metadata = None source_index: int = 0 while self._row_index < len(self._rows) and source_index < len(seg_scr_refs): compare: int = 0 - row_scr_refs, text = self._rows[self._row_index] + row = self._rows[self._row_index] + row_scr_refs, text, metadata = row.refs, row.text, row.metadata for row_scr_ref in row_scr_refs: while source_index < len(seg_scr_refs): compare = row_scr_ref.compare_to(seg_scr_refs[source_index], compare_segments=False) @@ -302,11 +311,12 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]: # source and row match # grab the text - both source and row will be incremented in due time... row_texts.append(text) + row_metadata = metadata break if compare <= 0: # source is ahead of row, increment row self._row_index += 1 - return row_texts + return row_texts, row_metadata def _collect_updatable_tokens(self, state: UsfmParserState) -> None: self._use_updated_text() @@ -377,8 +387,10 @@ def _has_new_text(self) -> bool: return any(self._replace_stack) and self._replace_stack[-1] def _start_update_block(self, scripture_refs: Sequence[ScriptureRef]) -> None: - self._update_block_stack.append(UsfmUpdateBlock(scripture_refs)) - row_texts: List[str] = self._advance_rows(scripture_refs) + row_texts, metadata = self._advance_rows(scripture_refs) + self._update_block_stack.append( + UsfmUpdateBlock(scripture_refs, metadata=metadata if metadata is not None else {}) + ) self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None: diff --git a/machine/corpora/usfm_update_block.py b/machine/corpora/usfm_update_block.py index 3206495c..977e82b2 100644 --- a/machine/corpora/usfm_update_block.py +++ b/machine/corpora/usfm_update_block.py @@ -8,9 +8,15 @@ class UsfmUpdateBlock: - def __init__(self, refs: Iterable[ScriptureRef] = [], elements: Iterable[UsfmUpdateBlockElement] = []) -> None: + def __init__( + self, + refs: Iterable[ScriptureRef] = [], + elements: Iterable[UsfmUpdateBlockElement] = [], + metadata: dict[str, object] = {}, + ) -> None: self._refs: list[ScriptureRef] = list(refs) self._elements: list[UsfmUpdateBlockElement] = list(elements) + self._metadata: dict[str, object] = metadata @property def refs(self) -> Sequence[ScriptureRef]: @@ -20,6 +26,10 @@ def refs(self) -> Sequence[ScriptureRef]: def elements(self) -> Sequence[UsfmUpdateBlockElement]: return self._elements + @property + def metadata(self) -> dict[str, object]: + return self._metadata + def add_text(self, tokens: Iterable[UsfmToken]) -> None: self._elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, list(tokens))) @@ -58,7 +68,7 @@ def get_tokens(self) -> list[UsfmToken]: return [token for element in self._elements for token in element.get_tokens()] def __eq__(self, other: UsfmUpdateBlock) -> bool: - return self._refs == other._refs and self._elements == other._elements + return self._refs == other._refs and self._elements == other._elements and self._metadata == other._metadata def copy(self) -> UsfmUpdateBlock: - return UsfmUpdateBlock(self._refs, self._elements) + return UsfmUpdateBlock(self._refs, self._elements, self._metadata) diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py index 27c97461..60510bad 100644 --- a/tests/corpora/test_place_markers_usfm_update_block_handler.py +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Sequence, Tuple +from typing import List, Optional, Sequence from machine.corpora import ( AlignedWordPair, @@ -7,6 +7,7 @@ ScriptureRef, UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, + UpdateUsfmRow, UpdateUsfmTextBehavior, UsfmUpdateBlockHandler, parse_usfm, @@ -20,7 +21,14 @@ def test_paragraph_markers() -> None: source = "This is the first paragraph. This text is in English, and this test is for paragraph markers." pretranslation = "Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo." - rows = [(scr_ref("MAT 1:1"), str(pretranslation))] + align_info = PlaceMarkersAlignmentInfo( + source_tokens=[t for t in TOKENIZER.tokenize(source)], + translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment=to_word_alignment_matrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + ) + rows = [UpdateUsfmRow(scr_ref("MAT 1:1"), str(pretranslation), {"alignment_info": align_info})] usfm = r"""\id MAT \c 1 \v 1 This is the first paragraph. @@ -28,21 +36,11 @@ def test_paragraph_markers() -> None: \p and this test is for paragraph markers. """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=[t for t in TOKENIZER.tokenize(source)], - translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)], - alignment=to_word_alignment_matrix( - "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" - ), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -56,27 +54,24 @@ def test_paragraph_markers() -> None: def test_style_markers() -> None: source = "This is the first sentence. This text is in English, and this test is for style markers." pretranslation = "Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo." - rows = [(scr_ref("MAT 1:1"), str(pretranslation))] + align_info = PlaceMarkersAlignmentInfo( + source_tokens=[t for t in TOKENIZER.tokenize(source)], + translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment=to_word_alignment_matrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + ) + rows = [UpdateUsfmRow(scr_ref("MAT 1:1"), str(pretranslation), metadata={"alignment_info": align_info})] usfm = r"""\id MAT \c 1 \v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers. """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=[t for t in TOKENIZER.tokenize(source)], - translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)], - alignment=to_word_alignment_matrix( - "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" - ), - ), - ] target = update_usfm( rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -88,7 +83,7 @@ def test_style_markers() -> None: rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.STRIP, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -100,14 +95,14 @@ def test_style_markers() -> None: # NOTE: Not currently updating embeds, will need to change test when we do def test_embeds() -> None: rows = [ - (scr_ref("MAT 1:1"), "New verse 1"), - (scr_ref("MAT 1:2"), "New verse 2"), - (scr_ref("MAT 1:3"), "New verse 3"), - (scr_ref("MAT 1:4"), "New verse 4"), - (scr_ref("MAT 1:4/1:f"), "New embed text"), - (scr_ref("MAT 1:5"), "New verse 5"), - (scr_ref("MAT 1:6"), "New verse 6"), - (scr_ref("MAT 1:6/1:f"), "New verse 6 embed text"), + UpdateUsfmRow(scr_ref("MAT 1:1"), "New verse 1"), + UpdateUsfmRow(scr_ref("MAT 1:2"), "New verse 2"), + UpdateUsfmRow(scr_ref("MAT 1:3"), "New verse 3"), + UpdateUsfmRow(scr_ref("MAT 1:4"), "New verse 4"), + UpdateUsfmRow(scr_ref("MAT 1:4/1:f"), "New embed text"), + UpdateUsfmRow(scr_ref("MAT 1:5"), "New verse 5"), + UpdateUsfmRow(scr_ref("MAT 1:6"), "New verse 6"), + UpdateUsfmRow(scr_ref("MAT 1:6/1:f"), "New verse 6 embed text"), ] usfm = r"""\id MAT \c 1 @@ -119,12 +114,11 @@ def test_embeds() -> None: \v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f* """ - align_info = [] target = update_usfm( rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -141,7 +135,7 @@ def test_embeds() -> None: rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -156,7 +150,19 @@ def test_embeds() -> None: def test_trailing_empty_paragraphs() -> None: - rows = [(scr_ref("MAT 1:1"), "New verse 1")] + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "New verse 1", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["Verse", "1"], + translation_tokens=["New", "verse", "1"], + alignment=to_word_alignment_matrix("0-1 1-2"), + ) + }, + ) + ] usfm = r"""\id MAT \c 1 \v 1 \f embed 1 \f*Verse 1 @@ -165,19 +171,11 @@ def test_trailing_empty_paragraphs() -> None: \q1 \f embed 2 \f* """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=["Verse", "1"], - translation_tokens=["New", "verse", "1"], - alignment=to_word_alignment_matrix("0-1 1-2"), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -191,10 +189,30 @@ def test_trailing_empty_paragraphs() -> None: def test_headers() -> None: rows = [ - (scr_ref("MAT 1:1"), "X Y Z"), - (scr_ref("MAT 1:2"), "X"), - (scr_ref("MAT 1:3"), "Y"), - (scr_ref("MAT 1:3/1:s1"), "Updated header"), + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "X Y Z", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["A", "B", "C"], + translation_tokens=["X", "Y", "Z"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2"), + ) + }, + ), + UpdateUsfmRow( + scr_ref("MAT 1:2"), + "X", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["A"], + translation_tokens=["X"], + alignment=to_word_alignment_matrix("0-0"), + ) + }, + ), + UpdateUsfmRow(scr_ref("MAT 1:3"), "Y"), + UpdateUsfmRow(scr_ref("MAT 1:3/1:s1"), "Updated header"), ] usfm = r"""\id MAT \c 1 @@ -218,25 +236,11 @@ def test_headers() -> None: \s1 Header to be updated """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=["A", "B", "C"], - translation_tokens=["X", "Y", "Z"], - alignment=to_word_alignment_matrix("0-0 1-1 2-2"), - ), - PlaceMarkersAlignmentInfo( - refs=["MAT 1:2"], - source_tokens=["A"], - translation_tokens=["X"], - alignment=to_word_alignment_matrix("0-0"), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -263,27 +267,31 @@ def test_headers() -> None: def test_consecutive_markers() -> None: - rows = [(scr_ref("MAT 1:1"), "New verse 1 WORD")] + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "New verse 1 WORD", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["Old", "verse", "1", "word"], + translation_tokens=["New", "verse", "1", "WORD"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3"), + ) + }, + ) + ] usfm = r"""\id MAT \c 1 \v 1 Old verse 1 \p \qt \+w word\+w*\qt* """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=["Old", "verse", "1", "word"], - translation_tokens=["New", "verse", "1", "WORD"], - alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3"), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -294,26 +302,30 @@ def test_consecutive_markers() -> None: def test_verse_ranges() -> None: - rows = [([ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], "New verse range text new paragraph 2")] + rows = [ + UpdateUsfmRow( + [ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], + "New verse range text new paragraph 2", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["Verse", "range", "old", "paragraph", "2"], + translation_tokens=["New", "verse", "range", "text", "new", "paragraph", "2"], + alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5 4-6"), + ) + }, + ) + ] usfm = r"""\id MAT \c 1 \v 1-5 Verse range \p old paragraph 2 """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)], - source_tokens=["Verse", "range", "old", "paragraph", "2"], - translation_tokens=["New", "verse", "range", "text", "new", "paragraph", "2"], - alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5 4-6"), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -324,27 +336,31 @@ def test_verse_ranges() -> None: def test_no_update() -> None: - rows = [(scr_ref("MAT 1:1"), "New paragraph 1 New paragraph 2")] + # Strip paragraphs + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "New paragraph 1 New paragraph 2", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["Old", "paragraph", "1", "Old", "paragraph", "2"], + translation_tokens=["New", "paragraph", "1", "New", "paragraph", "2"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"), + ) + }, + ) + ] usfm = r"""\id MAT \c 1 \v 1 Old paragraph 1 \p Old paragraph 2 """ - # Strip paragraphs - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=["Old", "paragraph", "1", "Old", "paragraph", "2"], - translation_tokens=["New", "paragraph", "1", "New", "paragraph", "2"], - alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -353,19 +369,25 @@ def test_no_update() -> None: assess(target, result) # No alignment - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=[], - translation_tokens=[], - alignment=to_word_alignment_matrix(""), - ), + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "New paragraph 1 New paragraph 2", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=[], + translation_tokens=[], + alignment=to_word_alignment_matrix(""), + ) + }, + ) ] + target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -376,12 +398,11 @@ def test_no_update() -> None: # No text update rows = [] - align_info = [] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -392,7 +413,19 @@ def test_no_update() -> None: def test_split_tokens() -> None: - rows = [(scr_ref("MAT 1:1"), "words split words split words split")] + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "words split words split words split", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["words", "split", "words", "split", "words", "split"], + translation_tokens=["words", "split", "words", "split", "words", "split"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"), + ) + }, + ) + ] usfm = r"""\id MAT \c 1 \v 1 words spl @@ -400,19 +433,11 @@ def test_split_tokens() -> None: \p it words split """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=["words", "split", "words", "split", "words", "split"], - translation_tokens=["words", "split", "words", "split", "words", "split"], - alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -424,26 +449,30 @@ def test_split_tokens() -> None: def test_no_text() -> None: - rows = [(scr_ref("MAT 1:1"), "")] + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=[], + translation_tokens=[], + alignment=to_word_alignment_matrix(""), + ) + }, + ) + ] usfm = r"""\id MAT \c 1 \v 1 \w \w* """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=[], - translation_tokens=[], - alignment=to_word_alignment_matrix(""), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -453,26 +482,30 @@ def test_no_text() -> None: def test_consecutive_substring() -> None: - rows = [(scr_ref("MAT 1:1"), "string ring")] + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "string ring", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["string", "ring"], + translation_tokens=["string", "ring"], + alignment=to_word_alignment_matrix("0-0 1-1"), + ) + }, + ) + ] usfm = r"""\id MAT \c 1 \v 1 string \p ring """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=["string", "ring"], - translation_tokens=["string", "ring"], - alignment=to_word_alignment_matrix("0-0 1-1"), - ), - ] target = update_usfm( rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -483,7 +516,30 @@ def test_consecutive_substring() -> None: def test_verses_out_of_order() -> None: - rows = [(scr_ref("MAT 1:1"), "new verse 1 new paragraph 2"), (scr_ref("MAT 1:2"), "new verse 2")] + rows = [ + UpdateUsfmRow( + scr_ref("MAT 1:1"), + "new verse 1 new paragraph 2", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["verse", "1", "paragraph", "2"], + translation_tokens=["new", "verse", "1", "new", "paragraph", "2"], + alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5"), + ) + }, + ), + UpdateUsfmRow( + scr_ref("MAT 1:2"), + "new verse 2", + metadata={ + "alignment_info": PlaceMarkersAlignmentInfo( + source_tokens=["verse", "2"], + translation_tokens=["new", "verse", "2"], + alignment=to_word_alignment_matrix("0-1 1-2"), + ) + }, + ), + ] usfm = r"""\id MAT \c 1 \v 2 verse 2 @@ -491,25 +547,11 @@ def test_verses_out_of_order() -> None: \p paragraph 2 """ - align_info = [ - PlaceMarkersAlignmentInfo( - refs=["MAT 1:1"], - source_tokens=["verse", "1", "paragraph", "2"], - translation_tokens=["new", "verse", "1", "new", "paragraph", "2"], - alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5"), - ), - PlaceMarkersAlignmentInfo( - refs=["MAT 1:2"], - source_tokens=["verse", "2"], - translation_tokens=["new", "verse", "2"], - alignment=to_word_alignment_matrix("0-1 1-2"), - ), - ] target = update_usfm( rows, usfm, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING, - update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()], ) result = r"""\id MAT \c 1 @@ -537,7 +579,7 @@ def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix: def update_usfm( - rows: Sequence[Tuple[Sequence[ScriptureRef], str]], + rows: Sequence[UpdateUsfmRow], source: str, id_text: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 84422905..47cd6280 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Sequence, Tuple, Union +from typing import Iterable, List, Optional, Sequence, Union from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, ignore_line_endings @@ -7,6 +7,7 @@ ScriptureRef, UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, + UpdateUsfmRow, UpdateUsfmTextBehavior, UsfmUpdateBlock, UsfmUpdateBlockElementType, @@ -17,7 +18,7 @@ def test_get_usfm_verse_char_style() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("First verse of the first chapter."), ) @@ -39,11 +40,11 @@ def test_get_usfm_id_text() -> None: def test_get_usfm_strip_all_text() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), - ( + UpdateUsfmRow( scr_ref("MAT 1:3"), str("Update 3"), ), @@ -108,9 +109,9 @@ def test_get_usfm_strip_all_text() -> None: def test_get_usfm_strip_paragraphs_preserve_paragraph_styles(): rows = [ - (scr_ref("MAT 1:0/1:rem"), "New remark"), - (scr_ref("MAT 1:0/3:ip"), "Another new remark"), - (scr_ref("MAT 1:1"), "Update 1"), + UpdateUsfmRow(scr_ref("MAT 1:0/1:rem"), "New remark"), + UpdateUsfmRow(scr_ref("MAT 1:0/3:ip"), "Another new remark"), + UpdateUsfmRow(scr_ref("MAT 1:1"), "Update 1"), ] usfm = r"""\id MAT \c 1 @@ -156,11 +157,11 @@ def test_get_usfm_strip_paragraphs_preserve_paragraph_styles(): def test_preserve_paragraphs(): rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:0/1:rem"), str("Update remark"), ), - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -200,7 +201,7 @@ def test_preserve_paragraphs(): def test_paragraph_in_verse(): rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -245,11 +246,11 @@ def test_paragraph_in_verse(): def test_get_usfm_prefer_existing(): rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), - ( + UpdateUsfmRow( scr_ref("MAT 1:2"), str("Update 2"), ), @@ -274,11 +275,11 @@ def test_get_usfm_prefer_existing(): def test_get_usfm_prefer_rows(): rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:6"), str("Text 6"), ), - ( + UpdateUsfmRow( scr_ref("MAT 1:7"), str("Text 7"), ), @@ -292,7 +293,7 @@ def test_get_usfm_prefer_rows(): def test_get_usfm_verse_strip_note() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:1"), str("First verse of the second chapter."), ) @@ -304,7 +305,7 @@ def test_get_usfm_verse_strip_note() -> None: def test_get_usfm_verse_replace_with_note() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("updated text"), ), @@ -323,7 +324,7 @@ def test_get_usfm_verse_replace_with_note() -> None: def test_get_usfm_row_verse_segment() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:1a"), str("First verse of the second chapter."), ) @@ -335,7 +336,7 @@ def test_get_usfm_row_verse_segment() -> None: def test_get_usfm_verse_segment() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:7"), str("Seventh verse of the second chapter."), ) @@ -347,7 +348,7 @@ def test_get_usfm_verse_segment() -> None: def test_get_usfm_verse_multiple_paras() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:2"), str("Second verse of the first chapter."), ) @@ -362,7 +363,7 @@ def test_get_usfm_verse_multiple_paras() -> None: def test_get_usfm_verse_table() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:9"), str("Ninth verse of the second chapter."), ) @@ -374,7 +375,7 @@ def test_get_usfm_verse_table() -> None: def test_get_usfm_verse_range_single_row_multiple_verses() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:11", "MAT 2:12"), str("Eleventh verse of the second chapter. Twelfth verse of the second chapter."), ) @@ -386,7 +387,7 @@ def test_get_usfm_verse_range_single_row_multiple_verses() -> None: def test_get_usfm_verse_range_single_row_single_verse() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:11"), str("Eleventh verse of the second chapter."), ) @@ -398,11 +399,11 @@ def test_get_usfm_verse_range_single_row_single_verse() -> None: def test_get_usfm_verse_range_multiple_rows_single_verse() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:11"), str("Eleventh verse of the second chapter."), ), - ( + UpdateUsfmRow( scr_ref("MAT 2:12"), str("Twelfth verse of the second chapter."), ), @@ -414,15 +415,15 @@ def test_get_usfm_verse_range_multiple_rows_single_verse() -> None: def test_get_usfm_merge_verse_segments() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:2"), str("Verse 2."), ), - ( + UpdateUsfmRow( scr_ref("MAT 2:2a"), str("Verse 2a."), ), - ( + UpdateUsfmRow( scr_ref("MAT 2:2b"), str("Verse 2b."), ), @@ -434,11 +435,11 @@ def test_get_usfm_merge_verse_segments() -> None: def test_get_usfm_verse_opt_break() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:2"), str("Second verse of the second chapter."), ), - ( + UpdateUsfmRow( scr_ref("MAT 2:3"), str("Third verse of the second chapter."), ), @@ -450,7 +451,7 @@ def test_get_usfm_verse_opt_break() -> None: def test_get_usfm_verse_milestone() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:10"), str("Tenth verse of the second chapter."), ) @@ -462,7 +463,7 @@ def test_get_usfm_verse_milestone() -> None: def test_get_usfm_verse_unmatched() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:3"), str("Third verse of the first chapter."), ) @@ -474,7 +475,7 @@ def test_get_usfm_verse_unmatched() -> None: def test_get_usfm_nonverse_char_style() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:0/3:s1"), str("The second chapter."), ) @@ -486,7 +487,7 @@ def test_get_usfm_nonverse_char_style() -> None: def test_get_usfm_nonverse_paragraph() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:0/8:s"), str("The first chapter."), ) @@ -498,23 +499,23 @@ def test_get_usfm_nonverse_paragraph() -> None: def test_get_usfm_nonverse_relaxed() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:0/s"), str("The first chapter."), ), - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("First verse of the first chapter."), ), - ( + UpdateUsfmRow( scr_ref("MAT 2:0/tr/tc1"), str("The first cell of the table."), ), - ( + UpdateUsfmRow( scr_ref("MAT 2:0/tr/tc2"), str("The second cell of the table."), ), - ( + UpdateUsfmRow( scr_ref("MAT 2:0/tr/tc1"), str("The third cell of the table."), ), @@ -529,7 +530,7 @@ def test_get_usfm_nonverse_relaxed() -> None: def test_get_usfm_nonverse_sidebar() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:3/1:esb/1:ms"), str("The first paragraph of the sidebar."), ) @@ -541,11 +542,11 @@ def test_get_usfm_nonverse_sidebar() -> None: def test_get_usfm_nonverse_table() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:0/1:tr/1:tc1"), str("The first cell of the table."), ), - ( + UpdateUsfmRow( scr_ref("MAT 2:0/2:tr/1:tc1"), str("The third cell of the table."), ), @@ -557,7 +558,7 @@ def test_get_usfm_nonverse_table() -> None: def test_get_usfm_nonverse_optbreak() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:3/1:esb/2:p"), str("The second paragraph of the sidebar."), ) @@ -569,7 +570,7 @@ def test_get_usfm_nonverse_optbreak() -> None: def test_get_usfm_nonverse_milestone() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 2:7a/1:s"), str("A new section header."), ) @@ -581,7 +582,7 @@ def test_get_usfm_nonverse_milestone() -> None: def test_get_usfm_nonverse_skip_note() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:0/3:ip"), str("The introductory paragraph."), ) @@ -593,7 +594,7 @@ def test_get_usfm_nonverse_skip_note() -> None: def test_get_usfm_nonverse_replace_with_note() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:0/3:ip"), str("The introductory paragraph."), ), @@ -605,7 +606,7 @@ def test_get_usfm_nonverse_replace_with_note() -> None: def test_get_usfm_verse_double_va_vp() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 3:1"), str("Updating later in the book to start."), ) @@ -618,7 +619,7 @@ def test_get_usfm_verse_double_va_vp() -> None: def test_get_usfm_verse_last_segment() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Updating the last verse."), ) @@ -640,27 +641,27 @@ def test_get_usfm_verse_last_segment() -> None: def test_get_usfm_verse_pretranslations_before_text() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("GEN 1:1"), str("Pretranslations before the start"), ), - ( + UpdateUsfmRow( scr_ref("GEN 1:2"), str("Pretranslations before the start"), ), - ( + UpdateUsfmRow( scr_ref("GEN 1:3"), str("Pretranslations before the start"), ), - ( + UpdateUsfmRow( scr_ref("GEN 1:4"), str("Pretranslations before the start"), ), - ( + UpdateUsfmRow( scr_ref("GEN 1:5"), str("Pretranslations before the start"), ), - ( + UpdateUsfmRow( scr_ref("MAT 1:0/3:ip"), str("The introductory paragraph."), ), @@ -673,11 +674,11 @@ def test_get_usfm_verse_pretranslations_before_text() -> None: def test_strip_paragraphs() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:0/2:p"), str("Update Paragraph"), ), - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update Verse 1"), ), @@ -721,7 +722,7 @@ def test_strip_paragraphs() -> None: def test_preservation_raw_strings() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str(r"Update all in one row \f \fr 1.1 \ft Some note \f*"), ) @@ -741,7 +742,7 @@ def test_preservation_raw_strings() -> None: def test_beginning_of_verse_embed() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str(r"Updated text"), ) @@ -761,7 +762,7 @@ def test_beginning_of_verse_embed() -> None: def test_cross_reference_dont_update() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1/1:x"), str("Update the cross reference"), ) @@ -781,7 +782,7 @@ def test_cross_reference_dont_update() -> None: def test_preserve_fig() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update"), ) @@ -801,11 +802,11 @@ def test_preserve_fig() -> None: def test_note_explicit_end_markers() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update text"), ), - ( + UpdateUsfmRow( scr_ref("MAT 1:1/1:f"), str("Update note"), ), @@ -832,7 +833,7 @@ def test_note_explicit_end_markers() -> None: def test_update_block_verse_preserve_paras() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -861,7 +862,7 @@ def test_update_block_verse_preserve_paras() -> None: def test_update_block_verse_strip_paras() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -890,7 +891,7 @@ def test_update_block_verse_strip_paras() -> None: def test_update_block_verse_range() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -917,7 +918,7 @@ def test_update_block_verse_range() -> None: def test_update_block_footnote_preserve_embeds() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -946,7 +947,7 @@ def test_update_block_footnote_preserve_embeds() -> None: def test_update_block_footnote_strip_embeds() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -973,7 +974,7 @@ def test_update_block_footnote_strip_embeds() -> None: def test_update_block_nonverse() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:0/1:s"), str("Updated section Header"), ), @@ -999,7 +1000,7 @@ def test_update_block_nonverse() -> None: def test_update_block_verse_preserve_styles() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -1030,7 +1031,7 @@ def test_update_block_verse_preserve_styles() -> None: def test_update_block_verse_strip_styles() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -1059,7 +1060,7 @@ def test_update_block_verse_strip_styles() -> None: def test_update_block_verse_section_header() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -1100,7 +1101,7 @@ def test_update_block_verse_section_header() -> None: def test_update_block_verse_section_header_in_verse() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -1135,7 +1136,7 @@ def test_update_block_verse_section_header_in_verse() -> None: def test_update_block_nonverse_paragraph_end_of_verse() -> None: rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -1166,11 +1167,11 @@ def test_update_block_nonverse_paragraph_end_of_verse() -> None: def test_header_reference_paragraphs() -> None: rows = [ - (scr_ref("MAT 1:1"), "new verse 1"), - (scr_ref("MAT 1:2"), "new verse 2"), - (scr_ref("MAT 1:3"), "new verse 3"), - (scr_ref("MAT 2:1"), "new verse 1"), - (scr_ref("MAT 2:2"), "new verse 2"), + UpdateUsfmRow(scr_ref("MAT 1:1"), "new verse 1"), + UpdateUsfmRow(scr_ref("MAT 1:2"), "new verse 2"), + UpdateUsfmRow(scr_ref("MAT 1:3"), "new verse 3"), + UpdateUsfmRow(scr_ref("MAT 2:1"), "new verse 1"), + UpdateUsfmRow(scr_ref("MAT 2:2"), "new verse 2"), ] usfm = r"""\id MAT \c 1 @@ -1215,7 +1216,7 @@ def test_header_reference_paragraphs() -> None: def test_pass_remark(): rows = [ - ( + UpdateUsfmRow( scr_ref("MAT 1:1"), str("Update 1"), ), @@ -1240,7 +1241,7 @@ def scr_ref(*refs: str) -> List[ScriptureRef]: def update_usfm( - rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, + rows: Optional[Sequence[UpdateUsfmRow]] = None, source: Optional[str] = None, id_text: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py index e742cd02..b795997b 100644 --- a/tests/corpora/test_usfm_manual.py +++ b/tests/corpora/test_usfm_manual.py @@ -2,7 +2,7 @@ import zipfile from dataclasses import dataclass from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional import pytest from testutils.corpora_test_helpers import TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_TARGET_PROJECT_PATH @@ -13,6 +13,7 @@ ParatextTextCorpus, ScriptureRef, StandardParallelTextCorpus, + UpdateUsfmRow, UpdateUsfmTextBehavior, ZipParatextProjectSettingsParser, ZipParatextProjectTextUpdater, @@ -28,8 +29,8 @@ def test_parse_parallel_corpus(): rows = list(p_corpus.get_rows()) assert rows - pretranslations: List[Tuple[List[ScriptureRef], str]] = [ - ([ScriptureRef() for s in r.source_refs], r.source_text) for r in rows + pretranslations: List[UpdateUsfmRow] = [ + (UpdateUsfmRow(refs=[ScriptureRef.parse(s) for s in r.source_refs], text=r.source_text)) for r in rows ] target_settings = FileParatextProjectSettingsParser(USFM_TARGET_PROJECT_PATH).parse() @@ -84,8 +85,10 @@ def get_usfm(project_path: Path): with open(PRETRANSLATION_PATH, "r") as pretranslation_stream: pretranslations = [ ( - [ScriptureRef.parse(r, settings.versification).to_relaxed() for r in p["refs"] or []], - p.get("translation", ""), + UpdateUsfmRow( + refs=[ScriptureRef.parse(r, settings.versification).to_relaxed() for r in p["refs"] or []], + text=p.get("translation", ""), + ) ) for p in json.load(pretranslation_stream) ] diff --git a/tests/corpora/test_usfm_memory_text.py b/tests/corpora/test_usfm_memory_text.py index 6b1d50c3..76bdea76 100644 --- a/tests/corpora/test_usfm_memory_text.py +++ b/tests/corpora/test_usfm_memory_text.py @@ -1,14 +1,8 @@ -from typing import List, Optional, Sequence, Tuple +from typing import List from testutils.corpora_test_helpers import scripture_ref from machine.corpora import ScriptureRef, TextRow, UsfmMemoryText -from machine.corpora.update_usfm_parser_handler import ( - UpdateUsfmMarkerBehavior, - UpdateUsfmParserHandler, - UpdateUsfmTextBehavior, -) -from machine.corpora.usfm_parser import parse_usfm from machine.corpora.usfm_stylesheet import UsfmStylesheet @@ -201,16 +195,3 @@ def get_rows(usfm: str, include_markers: bool = False, include_all_text: bool = ) return list(text.get_rows()) - - -def update_usfm( - usfm: str, - rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, - text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING, - embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, - style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, -) -> str: - - handler = UpdateUsfmParserHandler(rows, "MAT", text_behavior, embed_behavior, style_behavior) - parse_usfm(usfm, handler) - return handler.get_usfm()