Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,12 @@
normalize,
unescape_spaces,
)
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
from .update_usfm_parser_handler import (
UpdateUsfmMarkerBehavior,
UpdateUsfmParserHandler,
UpdateUsfmRow,
UpdateUsfmTextBehavior,
)
from .usfm_file_text import UsfmFileText
from .usfm_file_text_corpus import UsfmFileTextCorpus
from .usfm_memory_text import UsfmMemoryText
Expand Down Expand Up @@ -135,6 +140,7 @@
"UpdateUsfmMarkerBehavior",
"UpdateUsfmParserHandler",
"UpdateUsfmTextBehavior",
"UpdateUsfmRow",
"UsfmAttribute",
"UsfmElementType",
"UsfmFileText",
Expand Down
12 changes: 8 additions & 4 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union
from typing import BinaryIO, Iterable, Optional, Sequence, Union

from ..utils.typeshed import StrPath
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .scripture_ref import ScriptureRef
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
from .update_usfm_parser_handler import (
UpdateUsfmMarkerBehavior,
UpdateUsfmParserHandler,
UpdateUsfmRow,
UpdateUsfmTextBehavior,
)
from .usfm_parser import parse_usfm
from .usfm_update_block_handler import UsfmUpdateBlockHandler

Expand All @@ -20,7 +24,7 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
def update_usfm(
self,
book_id: str,
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
full_name: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
Expand Down
26 changes: 12 additions & 14 deletions machine/corpora/place_markers_usfm_update_block_handler.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,36 @@
from __future__ import annotations

from typing import Iterable, List, TypedDict
from typing import List, TypedDict, cast

from ..translation.word_alignment_matrix import WordAlignmentMatrix
from .usfm_token import UsfmToken, UsfmTokenType
from .usfm_update_block import UsfmUpdateBlock
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
from .usfm_update_block_handler import UsfmUpdateBlockHandler

PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"


class PlaceMarkersAlignmentInfo(TypedDict):
    """Alignment data consumed by PlaceMarkersUsfmUpdateBlockHandler.

    The handler reads a dict of this shape from UsfmUpdateBlock.metadata under
    PLACE_MARKERS_ALIGNMENT_INFO_KEY.
    """

    # Reference strings for the aligned text.
    # NOTE(review): presumably str(ScriptureRef) values -- confirm with whoever builds this dict.
    refs: List[str]
    # Tokens on the source side; passed to the handler's marker-location prediction.
    source_tokens: List[str]
    # Tokens on the translation side; passed to the handler's marker-location prediction.
    translation_tokens: List[str]
    # Alignment matrix between the two token lists. The handler checks its row_count and
    # column_count against 0 before using it.
    alignment: WordAlignmentMatrix


class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):

def __init__(self, align_info: Iterable[PlaceMarkersAlignmentInfo]) -> None:
self._align_info = {info["refs"][0]: info for info in align_info}

def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
ref = str(block.refs[0])
elements = list(block.elements)

# Nothing to do if there are no markers to place or no alignment to use
if PLACE_MARKERS_ALIGNMENT_INFO_KEY not in block.metadata:
return block

alignment_info = cast(PlaceMarkersAlignmentInfo, block.metadata[PLACE_MARKERS_ALIGNMENT_INFO_KEY])
if (
len(elements) == 0
or ref not in self._align_info.keys()
or self._align_info[ref]["alignment"].row_count == 0
or self._align_info[ref]["alignment"].column_count == 0
or alignment_info["alignment"].row_count == 0
or alignment_info["alignment"].column_count == 0
or not any(
(
e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
Expand Down Expand Up @@ -65,8 +65,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
):
eob_empty_paras = False

src_toks = self._align_info[ref]["source_tokens"]
trg_toks = self._align_info[ref]["translation_tokens"]
src_toks: List[str] = alignment_info["source_tokens"]
trg_toks: List[str] = alignment_info["translation_tokens"]
src_tok_idx = 0

src_sent = ""
Expand Down Expand Up @@ -112,9 +112,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
# Predict marker placements and get insertion order
to_insert = []
for element, adj_src_tok in zip(to_place, adj_src_toks):
adj_trg_tok = self._predict_marker_location(
self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks
)
adj_trg_tok = self._predict_marker_location(alignment_info["alignment"], adj_src_tok, src_toks, trg_toks)

if (
adj_trg_tok > 0
Expand Down
24 changes: 18 additions & 6 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,17 @@ class UpdateUsfmMarkerBehavior(Enum):
STRIP = auto()


class UpdateUsfmRow:
    """One replacement row supplied to UpdateUsfmParserHandler.

    Attributes:
        refs: Scripture references the row applies to.
        text: Replacement text for those references.
        metadata: Optional extra data attached to the row. When the row matches a segment,
            the handler forwards it as the metadata of the UsfmUpdateBlock it starts.
    """

    def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[dict[str, object]] = None):
        self.refs = refs
        self.text = text
        self.metadata = metadata

    def __repr__(self) -> str:
        # Rows are plain data holders; a readable repr makes mismatched rows easy to spot in logs and test failures.
        return f"{type(self).__name__}(refs={self.refs!r}, text={self.text!r}, metadata={self.metadata!r})"


class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
def __init__(
self,
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
id_text: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
Expand Down Expand Up @@ -284,12 +291,14 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
tokenizer = UsfmTokenizer(stylesheet)
return tokenizer.detokenize(self._tokens)

def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]:
def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
row_texts: List[str] = []
row_metadata = None
source_index: int = 0
while self._row_index < len(self._rows) and source_index < len(seg_scr_refs):
compare: int = 0
row_scr_refs, text = self._rows[self._row_index]
row = self._rows[self._row_index]
row_scr_refs, text, metadata = row.refs, row.text, row.metadata
for row_scr_ref in row_scr_refs:
while source_index < len(seg_scr_refs):
compare = row_scr_ref.compare_to(seg_scr_refs[source_index], compare_segments=False)
Expand All @@ -302,11 +311,12 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]:
# source and row match
# grab the text - both source and row will be incremented in due time...
row_texts.append(text)
row_metadata = metadata
break
if compare <= 0:
# source is ahead of row, increment row
self._row_index += 1
return row_texts
return row_texts, row_metadata

def _collect_updatable_tokens(self, state: UsfmParserState) -> None:
self._use_updated_text()
Expand Down Expand Up @@ -377,8 +387,10 @@ def _has_new_text(self) -> bool:
return any(self._replace_stack) and self._replace_stack[-1]

def _start_update_block(self, scripture_refs: Sequence[ScriptureRef]) -> None:
self._update_block_stack.append(UsfmUpdateBlock(scripture_refs))
row_texts: List[str] = self._advance_rows(scripture_refs)
row_texts, metadata = self._advance_rows(scripture_refs)
self._update_block_stack.append(
UsfmUpdateBlock(scripture_refs, metadata=metadata if metadata is not None else {})
)
self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])

def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None:
Expand Down
16 changes: 13 additions & 3 deletions machine/corpora/usfm_update_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,15 @@


class UsfmUpdateBlock:
def __init__(self, refs: Iterable[ScriptureRef] = [], elements: Iterable[UsfmUpdateBlockElement] = []) -> None:
def __init__(
self,
refs: Iterable[ScriptureRef] = [],
elements: Iterable[UsfmUpdateBlockElement] = [],
metadata: dict[str, object] = {},
) -> None:
self._refs: list[ScriptureRef] = list(refs)
self._elements: list[UsfmUpdateBlockElement] = list(elements)
self._metadata: dict[str, object] = metadata

@property
def refs(self) -> Sequence[ScriptureRef]:
Expand All @@ -20,6 +26,10 @@ def refs(self) -> Sequence[ScriptureRef]:
def elements(self) -> Sequence[UsfmUpdateBlockElement]:
return self._elements

@property
def metadata(self) -> dict[str, object]:
    """Return the metadata dict held by this block (the stored object itself, not a copy)."""
    return self._metadata

def add_text(self, tokens: Iterable[UsfmToken]) -> None:
self._elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, list(tokens)))

Expand Down Expand Up @@ -58,7 +68,7 @@ def get_tokens(self) -> list[UsfmToken]:
return [token for element in self._elements for token in element.get_tokens()]

def __eq__(self, other: UsfmUpdateBlock) -> bool:
return self._refs == other._refs and self._elements == other._elements
return self._refs == other._refs and self._elements == other._elements and self._metadata == other._metadata

def copy(self) -> UsfmUpdateBlock:
return UsfmUpdateBlock(self._refs, self._elements)
return UsfmUpdateBlock(self._refs, self._elements, self._metadata)
Loading