From 905a05633c1dca20bda02e52d17ad67dd966c52e Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 11 Sep 2025 17:44:06 -0400 Subject: [PATCH 1/6] Port unicode string-related tests to Python; address discrepancy regarding combining characters in Python strings --- machine/corpora/__init__.py | 2 + .../quotation_mark_finder.py | 8 +- .../quotation_mark_string_match.py | 12 +-- machine/punctuation_analysis/text_segment.py | 81 +++++++++++++++++-- ...tion_changing_usfm_block_update_handler.py | 23 +++--- tests/corpora/test_usfm_manual.py | 29 ++++++- .../test_quotation_mark_finder.py | 4 + .../test_quotation_mark_metadata.py | 10 +-- .../test_quotation_mark_string_match.py | 10 +++ .../punctuation_analysis/test_text_segment.py | 36 +++++---- tests/testutils/corpora_test_helpers.py | 2 + 11 files changed, 172 insertions(+), 45 deletions(-) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index dd540f3b..0a7c8f19 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -85,6 +85,7 @@ from .usx_file_text_corpus import UsxFileTextCorpus from .usx_memory_text import UsxMemoryText from .usx_zip_text import UsxZipText +from .zip_paratext_project_quote_convention_detector import ZipParatextProjectQuoteConventionDetector from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser @@ -188,6 +189,7 @@ "UsxFileTextCorpus", "UsxMemoryText", "UsxZipText", + "ZipParatextProjectQuoteConventionDetector", "ZipParatextProjectSettingsParser", "ZipParatextProjectSettingsParserBase", "ZipParatextProjectTermsParser", diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 73c95368..14e8e495 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -36,11 +36,15 @@ def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment ) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text): + for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text.string): if self._quote_conventions.is_valid_opening_quotation_mark( quotation_mark_match.group() ) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()): quotation_matches.append( - QuotationMarkStringMatch(text_segment, quotation_mark_match.start(), quotation_mark_match.end()) + QuotationMarkStringMatch( + text_segment, + text_segment.text.string_index_to_grapheme_index(quotation_mark_match.start()), + text_segment.text.string_index_to_grapheme_index(quotation_mark_match.end()), + ) ) return quotation_matches diff --git a/machine/punctuation_analysis/quotation_mark_string_match.py b/machine/punctuation_analysis/quotation_mark_string_match.py index 573e37c7..e9a82d8d 100644 --- a/machine/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/punctuation_analysis/quotation_mark_string_match.py @@ -35,7 +35,7 @@ def __eq__(self, value): @property def quotation_mark(self) -> str: - return self._text_segment.text[self._start_index : self._end_index] + return self._text_segment.text[self._start_index : self._end_index].string def is_valid_opening_quotation_mark(self, quote_conventions: 
QuoteConventionSet) -> bool: return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark) @@ -59,18 +59,18 @@ def previous_character(self) -> Optional[str]: if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( UsfmMarkerType.PARAGRAPH ): - return previous_segment.text[-1] + return previous_segment.text[-1].string return None - return self._text_segment.text[self._start_index - 1] + return self._text_segment.text[self._start_index - 1].string @property def next_character(self) -> Optional[str]: if self.is_at_end_of_segment(): next_segment = self._text_segment.next_segment if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH): - return next_segment.text[0] + return next_segment.text[0].string return None - return self._text_segment.text[self._end_index] + return self._text_segment.text[self._end_index].string def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool: return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None @@ -102,7 +102,7 @@ def end_index(self) -> int: def context(self) -> str: return self._text_segment.text[ max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) - ] + ].string def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: return QuotationMarkMetadata( diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index 78e63d4a..e2271f9a 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -1,3 +1,4 @@ +import unicodedata from typing import Optional, Set from ..corpora.usfm_token import UsfmToken @@ -6,7 +7,7 @@ class TextSegment: def __init__(self): - self._text = "" + self._text: GraphemeString = GraphemeString("") self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER self._markers_in_preceding_context: Set[UsfmMarkerType] = set() self.previous_segment: Optional[TextSegment] = None @@ -31,7 +32,7 @@ def __eq__(self, value): return True @property - def text(self) -> str: + def text(self) -> "GraphemeString": return self._text @property @@ -39,10 +40,10 @@ def length(self) -> int: return len(self._text) def substring_before(self, index: int) -> str: - return self._text[:index] + return self._text[:index].string def substring_after(self, index: int) -> str: - return self._text[index:] + return self._text[index:].string def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool: return marker in self._markers_in_preceding_context @@ -54,9 +55,9 @@ def is_last_segment_in_verse(self) -> bool: return self.index_in_verse == self.num_segments_in_verse - 1 def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: - self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index) + self._text = GraphemeString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) if self._usfm_token is not None: - self._usfm_token.text = self._text + self._usfm_token.text = self._text.string class Builder: def __init__(self): @@ -76,8 +77,74 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": return self def set_text(self, text: str) -> "TextSegment.Builder": - self._text_segment._text = text + self._text_segment._text = GraphemeString(text) return self def build(self) -> "TextSegment": return self._text_segment + + 
+class GraphemeString: + def __init__(self, string: str) -> None: + self._string = string + self._string_index_by_grapheme_index = { + grapheme_index: string_index + for grapheme_index, string_index in enumerate( + [i for i, c in enumerate(string) if unicodedata.category(c) not in ["Mc", "Mn"]] + ) + } + + def __len__(self) -> int: + return len(self._string_index_by_grapheme_index) + + @property + def string(self) -> str: + return self._string + + def __str__(self): + return self._string + + def __eq__(self, other) -> bool: + if not isinstance(other, GraphemeString): + return False + return self._string == other.string + + def __getitem__(self, key) -> "GraphemeString": + if isinstance(key, int): + grapheme_start = self._normalize_start_index(key) + grapheme_stop = self._normalize_stop_index(grapheme_start + 1) + string_start = self._string_index_by_grapheme_index.get(grapheme_start, len(self)) + string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, None) + return GraphemeString(self._string[string_start:string_stop]) + elif isinstance(key, slice): + if key.step is not None and key.step != 1: + raise TypeError("Steps are not allowed in _GraphemeString slices") + grapheme_start = self._normalize_start_index(key.start) + grapheme_stop = self._normalize_stop_index(key.stop) + string_start = self._string_index_by_grapheme_index.get(grapheme_start, len(self)) + string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, None) + return GraphemeString(self._string[string_start:string_stop]) + else: + raise TypeError("Indices must be integers or slices") + + def _normalize_start_index(self, index: int | None) -> int: + if index is None: + return 0 + if index < 0: + return len(self) + index + return index + + def _normalize_stop_index(self, index: int | None) -> int: + if index is None: + return len(self) + if index < 0: + return len(self) + index + return index + + def string_index_to_grapheme_index(self, string_index: int) -> int: + if string_index == len(self._string): + return len(self) + for g_index, s_index in self._string_index_by_grapheme_index.items(): + if s_index == string_index: + return g_index + raise ValueError(f"No corresponding grapheme index found for string index {string_index}.") diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index 5aca5561..6f3bdfe0 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -476,9 +476,12 @@ def test_process_scripture_element() -> None: assert quote_convention_changer._quotation_mark_finder.num_times_called == 1 assert mock_quotation_mark_resolver.num_times_called == 1 - assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment._text == "this is a ‘test" assert ( - quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment._text + str(quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text) + == "this is a ‘test" + ) + assert ( + str(quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text) == "the test ends” here" ) @@ -494,7 +497,7 @@ def test_create_text_segments_basic() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert text_segments[0]._text == "test segment" + assert 
str(text_segments[0].text) == "test segment" assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert text_segments[0]._markers_in_preceding_context == set() assert text_segments[0].previous_segment is None @@ -517,7 +520,7 @@ def test_create_text_segments_with_preceding_markers() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert text_segments[0]._text == "test segment" + assert str(text_segments[0].text) == "test segment" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -547,7 +550,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 2 - assert text_segments[0]._text == "test segment1" + assert str(text_segments[0].text) == "test segment1" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -555,7 +558,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: } assert text_segments[0].previous_segment is None assert text_segments[0].next_segment == text_segments[1] - assert text_segments[1]._text == "test segment2" + assert str(text_segments[1].text) == "test segment2" assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER assert text_segments[1]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -574,7 +577,7 @@ def test_create_text_segment() -> None: segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token) assert segment is not None - assert segment._text == "test segment" + assert str(segment.text) == "test segment" assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert segment._markers_in_preceding_context == set() assert segment._usfm_token == usfm_token @@ -644,7 +647,7 @@ def test_update_quotation_marks() -> None: multi_char_to_single_char_quote_convention_changer._update_quotation_marks(multi_character_quotation_marks) - assert multi_character_text_segment.text == "this “is ‘a test segment’ ”" + assert str(multi_character_text_segment.text) == "this “is ‘a test segment’ ”" assert multi_character_quotation_marks[0].start_index == 5 assert multi_character_quotation_marks[0].end_index == 6 @@ -704,7 +707,7 @@ def test_update_quotation_marks() -> None: single_char_to_multi_char_quote_convention_changer._update_quotation_marks(single_character_quotation_marks) - assert single_character_text_segment.text == "this < >>" + assert str(single_character_text_segment.text) == "this < >>" assert single_character_quotation_marks[0].start_index == 5 assert single_character_quotation_marks[0].end_index == 7 @@ -765,7 +768,7 @@ def test_start_new_chapter() -> None: segment = quote_convention_changer._next_scripture_text_segment_builder.build() assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER - assert segment._text == "" + assert str(segment.text) == "" assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set() diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py index 
b795997b..45166d26 100644 --- a/tests/corpora/test_usfm_manual.py +++ b/tests/corpora/test_usfm_manual.py @@ -5,7 +5,13 @@ from typing import List, Optional import pytest -from testutils.corpora_test_helpers import TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_TARGET_PROJECT_PATH +from testutils.corpora_test_helpers import ( + TEST_DATA_PATH, + USFM_SOURCE_PROJECT_PATH, + USFM_SOURCE_PROJECT_ZIP_PATH, + USFM_TARGET_PROJECT_PATH, + USFM_TARGET_PROJECT_ZIP_PATH, +) from machine.corpora import ( FileParatextProjectSettingsParser, @@ -15,9 +21,11 @@ StandardParallelTextCorpus, UpdateUsfmRow, UpdateUsfmTextBehavior, + ZipParatextProjectQuoteConventionDetector, ZipParatextProjectSettingsParser, ZipParatextProjectTextUpdater, ) +from machine.punctuation_analysis import QuoteConventionDetector @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") @@ -124,3 +132,22 @@ def get_usfm(project_path: Path): assert False, f"Failed to process {subdir}: {e}" else: get_usfm(PARATEXT_PROJECT_PATH) + + +@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") +def test_analyze_corpora_quote_conventions(): + source_handler = QuoteConventionDetector() + source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") + source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive) + source_quote_convention_detector.get_quote_convention_analysis(source_handler) + + target_handler = QuoteConventionDetector() + target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r") + target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive) + target_quote_convention_detector.get_quote_convention_analysis(target_handler) + + source_analysis = source_handler.detect_quote_convention() + target_analysis = target_handler.detect_quote_convention() + + assert source_analysis is not None + assert target_analysis is not None diff --git a/tests/punctuation_analysis/test_quotation_mark_finder.py b/tests/punctuation_analysis/test_quotation_mark_finder.py index 5d1a709c..f3e26c21 100644 --- a/tests/punctuation_analysis/test_quotation_mark_finder.py +++ b/tests/punctuation_analysis/test_quotation_mark_finder.py @@ -175,6 +175,10 @@ def test_that_all_possible_quotation_marks_are_identified() -> None: ), ] + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build() + ) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 6, 7)] + def test_that_it_uses_the_quote_convention_set() -> None: standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") diff --git a/tests/punctuation_analysis/test_quotation_mark_metadata.py b/tests/punctuation_analysis/test_quotation_mark_metadata.py index 5f2b265d..44e43518 100644 --- a/tests/punctuation_analysis/test_quotation_mark_metadata.py +++ b/tests/punctuation_analysis/test_quotation_mark_metadata.py @@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said," + assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -30,7 +30,7 @@ def 
test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, «Has God really said," + assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, «Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,' + assert str(quotation_mark_metadata.text_segment.text) == 'He said to the woman, "«as God really said,' def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: @@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, < None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said," + assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said," assert quotation_mark_metadata.start_index == 22 assert quotation_mark_metadata.end_index == 23 diff --git a/tests/punctuation_analysis/test_quotation_mark_string_match.py b/tests/punctuation_analysis/test_quotation_mark_string_match.py index 39485493..c24e3c3e 100644 --- a/tests/punctuation_analysis/test_quotation_mark_string_match.py +++ b/tests/punctuation_analysis/test_quotation_mark_string_match.py @@ -121,6 +121,16 @@ def test_get_previous_character() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) assert quotation_mark_string_match.previous_character == "“" + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder() + .set_text('"उत्पत्ति पुस्तकले') + .set_previous_segment(TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build()) + .build(), + 0, + 1, + ) + assert quotation_mark_string_match.previous_character == "ले" + def test_get_next_character() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) diff --git a/tests/punctuation_analysis/test_text_segment.py b/tests/punctuation_analysis/test_text_segment.py index aa215d67..9d2b8744 100644 --- a/tests/punctuation_analysis/test_text_segment.py +++ b/tests/punctuation_analysis/test_text_segment.py @@ -5,7 +5,7 @@ def test_builder_initialization() -> None: builder = TextSegment.Builder() - assert builder._text_segment._text == "" + assert str(builder._text_segment.text) == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER @@ -20,7 +20,7 @@ def test_builder_set_text() -> None: text = "Example text" builder.set_text(text) - assert builder._text_segment._text == text + assert str(builder._text_segment.text) == text def test_builder_set_previous_segment() -> None: @@ -62,7 +62,7 @@ def test_builder_set_usfm_token() -> None: assert 
builder._text_segment._usfm_token is not None assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT assert builder._text_segment._usfm_token.text == "USFM token text" - assert builder._text_segment._text == "" + assert str(builder._text_segment.text) == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None @@ -148,10 +148,10 @@ def test_equals() -> None: def test_get_text() -> None: text_segment = TextSegment.Builder().set_text("example text").build() - assert text_segment.text == "example text" + assert str(text_segment.text) == "example text" text_segment = TextSegment.Builder().set_text("new example text").build() - assert text_segment.text == "new example text" + assert str(text_segment.text) == "new example text" def test_length() -> None: @@ -161,6 +161,14 @@ def test_length() -> None: text_segment = TextSegment.Builder().set_text("new example text").build() assert text_segment.length == len("new example text") + # Combining characters + text_segment = TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build() + assert text_segment.length == 11 + + # Surrogate pairs + text_segment = TextSegment.Builder().set_text("𝜺𝜺").build() + assert text_segment.length == 2 + def test_substring_before() -> None: text_segment = TextSegment.Builder().set_text("example text").build() @@ -243,28 +251,28 @@ def test_is_last_segment_in_verse() -> None: def test_replace_substring() -> None: text_segment = TextSegment.Builder().set_text("example text").build() text_segment.replace_substring(0, 7, "sample") - assert text_segment.text == "sample text" + assert str(text_segment.text) == "sample text" text_segment.replace_substring(7, 11, "text") - assert text_segment.text == "sample text" + assert str(text_segment.text) == "sample text" text_segment.replace_substring(0, 7, "") - assert text_segment.text == "text" + assert str(text_segment.text) == "text" text_segment.replace_substring(0, 4, "new'") - assert text_segment.text == "new'" + assert str(text_segment.text) == "new'" text_segment.replace_substring(3, 4, "\u2019") - assert text_segment.text == "new\u2019" + assert str(text_segment.text) == "new\u2019" text_segment.replace_substring(0, 0, "prefix ") - assert text_segment.text == "prefix new\u2019" + assert str(text_segment.text) == "prefix new\u2019" text_segment.replace_substring(0, 0, "") - assert text_segment.text == "prefix new\u2019" + assert str(text_segment.text) == "prefix new\u2019" text_segment.replace_substring(11, 11, " suffix") - assert text_segment.text == "prefix new\u2019 suffix" + assert str(text_segment.text) == "prefix new\u2019 suffix" text_segment.replace_substring(6, 6, "-") - assert text_segment.text == "prefix- new\u2019 suffix" + assert str(text_segment.text) == "prefix- new\u2019 suffix" diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index e2875605..6c1e7d8b 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -8,7 +8,9 @@ USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" +USFM_TARGET_PROJECT_ZIP_PATH = TEST_DATA_PATH / "project" / "target" USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" +USFM_SOURCE_PROJECT_ZIP_PATH = TEST_DATA_PATH / "project" / "source" USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id" USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id" USX_TEST_PROJECT_PATH = 
TEST_DATA_PATH / "usx" / "Tes" From c7c8c3afb9d2a9496b3425e5b2738b7cad91e644 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 11 Sep 2025 17:47:08 -0400 Subject: [PATCH 2/6] Remove redundant string property --- .../punctuation_analysis/quotation_mark_finder.py | 2 +- .../quotation_mark_string_match.py | 14 +++++++------- machine/punctuation_analysis/text_segment.py | 12 ++++-------- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 14e8e495..09982770 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -36,7 +36,7 @@ def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment ) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text.string): + for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(str(text_segment.text)): if self._quote_conventions.is_valid_opening_quotation_mark( quotation_mark_match.group() ) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()): diff --git a/machine/punctuation_analysis/quotation_mark_string_match.py b/machine/punctuation_analysis/quotation_mark_string_match.py index e9a82d8d..d73c5543 100644 --- a/machine/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/punctuation_analysis/quotation_mark_string_match.py @@ -35,7 +35,7 @@ def __eq__(self, value): @property def quotation_mark(self) -> str: - return self._text_segment.text[self._start_index : self._end_index].string + return str(self._text_segment.text[self._start_index : self._end_index]) def is_valid_opening_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool: return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark) @@ -59,18 +59,18 @@ def previous_character(self) -> Optional[str]: if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( UsfmMarkerType.PARAGRAPH ): - return previous_segment.text[-1].string + return str(previous_segment.text[-1]) return None - return self._text_segment.text[self._start_index - 1].string + return str(self._text_segment.text[self._start_index - 1]) @property def next_character(self) -> Optional[str]: if self.is_at_end_of_segment(): next_segment = self._text_segment.next_segment if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH): - return next_segment.text[0].string + return str(next_segment.text[0]) return None - return self._text_segment.text[self._end_index].string + return str(self._text_segment.text[self._end_index]) def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool: return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None @@ -100,9 +100,9 @@ def end_index(self) -> int: # Not used, but a useful method for debugging @property def context(self) -> str: - return self._text_segment.text[ + return str(self._text_segment.text[ max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) - ].string + ]) def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: return QuotationMarkMetadata( diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index e2271f9a..b232653a 
100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -40,10 +40,10 @@ def length(self) -> int: return len(self._text) def substring_before(self, index: int) -> str: - return self._text[:index].string + return str(self._text[:index]) def substring_after(self, index: int) -> str: - return self._text[index:].string + return str(self._text[index:]) def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool: return marker in self._markers_in_preceding_context @@ -57,7 +57,7 @@ def is_last_segment_in_verse(self) -> bool: def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: self._text = GraphemeString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) if self._usfm_token is not None: - self._usfm_token.text = self._text.string + self._usfm_token.text = str(self._text) class Builder: def __init__(self): @@ -97,17 +97,13 @@ def __init__(self, string: str) -> None: def __len__(self) -> int: return len(self._string_index_by_grapheme_index) - @property - def string(self) -> str: - return self._string - def __str__(self): return self._string def __eq__(self, other) -> bool: if not isinstance(other, GraphemeString): return False - return self._string == other.string + return self._string == other._string def __getitem__(self, key) -> "GraphemeString": if isinstance(key, int): From f9300507b54f44ba1c870ba02851930122aef7c1 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 12 Sep 2025 09:26:36 -0400 Subject: [PATCH 3/6] Fix formatting --- .../punctuation_analysis/quotation_mark_string_match.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_string_match.py b/machine/punctuation_analysis/quotation_mark_string_match.py index d73c5543..dcafa869 100644 --- a/machine/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/punctuation_analysis/quotation_mark_string_match.py @@ -100,9 +100,11 @@ def end_index(self) -> int: # Not used, but a useful method for debugging @property def context(self) -> str: - return str(self._text_segment.text[ - max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) - ]) + return str( + self._text_segment.text[ + max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) + ] + ) def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: return QuotationMarkMetadata( From a12b917865df830b86d4996f1a7a630b98744262 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 12 Sep 2025 09:32:20 -0400 Subject: [PATCH 4/6] Use Optional[] --- machine/punctuation_analysis/text_segment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index b232653a..cfa0b1a1 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -123,14 +123,14 @@ def __getitem__(self, key) -> "GraphemeString": else: raise TypeError("Indices must be integers or slices") - def _normalize_start_index(self, index: int | None) -> int: + def _normalize_start_index(self, index: Optional[int]) -> int: if index is None: return 0 if index < 0: return len(self) + index return index - def _normalize_stop_index(self, index: int | None) -> int: + def _normalize_stop_index(self, index: Optional[int]) -> int: if index is None: return len(self) if index < 0: From 
38292fdfb68ad7e2e0171a48cb75c17696a42952 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 15 Sep 2025 08:17:10 -0400 Subject: [PATCH 5/6] Change class name --- .../quotation_mark_finder.py | 4 +- machine/punctuation_analysis/text_segment.py | 50 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 09982770..65b7e784 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -43,8 +43,8 @@ def find_all_potential_quotation_marks_in_text_segment( quotation_matches.append( QuotationMarkStringMatch( text_segment, - text_segment.text.string_index_to_grapheme_index(quotation_mark_match.start()), - text_segment.text.string_index_to_grapheme_index(quotation_mark_match.end()), + text_segment.text.string_index_to_glyph_index(quotation_mark_match.start()), + text_segment.text.string_index_to_glyph_index(quotation_mark_match.end()), ) ) return quotation_matches diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index cfa0b1a1..c4348c2d 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -7,7 +7,7 @@ class TextSegment: def __init__(self): - self._text: GraphemeString = GraphemeString("") + self._text: GlyphString = GlyphString("") self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER self._markers_in_preceding_context: Set[UsfmMarkerType] = set() self.previous_segment: Optional[TextSegment] = None @@ -32,7 +32,7 @@ def __eq__(self, value): return True @property - def text(self) -> "GraphemeString": + def text(self) -> "GlyphString": return self._text @property @@ -55,7 +55,7 @@ def is_last_segment_in_verse(self) -> bool: return self.index_in_verse == self.num_segments_in_verse - 1 def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: - self._text = GraphemeString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) + self._text = GlyphString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) if self._usfm_token is not None: self._usfm_token.text = str(self._text) @@ -77,49 +77,49 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": return self def set_text(self, text: str) -> "TextSegment.Builder": - self._text_segment._text = GraphemeString(text) + self._text_segment._text = GlyphString(text) return self def build(self) -> "TextSegment": return self._text_segment -class GraphemeString: +class GlyphString: def __init__(self, string: str) -> None: self._string = string - self._string_index_by_grapheme_index = { - grapheme_index: string_index - for grapheme_index, string_index in enumerate( + self._string_index_by_glyph_index = { + glyph_index: string_index + for glyph_index, string_index in enumerate( [i for i, c in enumerate(string) if unicodedata.category(c) not in ["Mc", "Mn"]] ) } def __len__(self) -> int: - return len(self._string_index_by_grapheme_index) + return len(self._string_index_by_glyph_index) def __str__(self): return self._string def __eq__(self, other) -> bool: - if not isinstance(other, GraphemeString): + if not isinstance(other, GlyphString): return False return self._string == other._string - def __getitem__(self, key) -> "GraphemeString": + def __getitem__(self, key) -> "GlyphString": if isinstance(key, int): - 
grapheme_start = self._normalize_start_index(key) - grapheme_stop = self._normalize_stop_index(grapheme_start + 1) - string_start = self._string_index_by_grapheme_index.get(grapheme_start, len(self)) - string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, None) - return GraphemeString(self._string[string_start:string_stop]) + glyph_start = self._normalize_start_index(key) + glyph_stop = self._normalize_stop_index(glyph_start + 1) + string_start = self._string_index_by_glyph_index.get(glyph_start, len(self)) + string_stop = self._string_index_by_glyph_index.get(glyph_stop, None) + return GlyphString(self._string[string_start:string_stop]) elif isinstance(key, slice): if key.step is not None and key.step != 1: - raise TypeError("Steps are not allowed in _GraphemeString slices") - grapheme_start = self._normalize_start_index(key.start) - grapheme_stop = self._normalize_stop_index(key.stop) - string_start = self._string_index_by_grapheme_index.get(grapheme_start, len(self)) - string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, None) - return GraphemeString(self._string[string_start:string_stop]) + raise TypeError("Steps are not allowed in _glyphString slices") + glyph_start = self._normalize_start_index(key.start) + glyph_stop = self._normalize_stop_index(key.stop) + string_start = self._string_index_by_glyph_index.get(glyph_start, len(self)) + string_stop = self._string_index_by_glyph_index.get(glyph_stop, None) + return GlyphString(self._string[string_start:string_stop]) else: raise TypeError("Indices must be integers or slices") @@ -137,10 +137,10 @@ def _normalize_stop_index(self, index: Optional[int]) -> int: return len(self) + index return index - def string_index_to_grapheme_index(self, string_index: int) -> int: + def string_index_to_glyph_index(self, string_index: int) -> int: if string_index == len(self._string): return len(self) - for g_index, s_index in self._string_index_by_grapheme_index.items(): + for g_index, s_index in self._string_index_by_glyph_index.items(): if s_index == string_index: return g_index - raise ValueError(f"No corresponding grapheme index found for string index {string_index}.") + raise ValueError(f"No corresponding glyph index found for string index {string_index}.") From 433c57916d71fcfd91865a296ba8f69aa6bb186c Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 15 Sep 2025 18:16:21 -0400 Subject: [PATCH 6/6] Remove custom combining character handling --- .../quotation_mark_finder.py | 4 +- machine/punctuation_analysis/text_segment.py | 71 ++----------------- ...tion_changing_usfm_block_update_handler.py | 24 +++---- .../test_quotation_mark_finder.py | 2 +- .../test_quotation_mark_metadata.py | 10 +-- .../test_quotation_mark_string_match.py | 2 +- .../punctuation_analysis/test_text_segment.py | 30 ++++---- 7 files changed, 38 insertions(+), 105 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 65b7e784..6d7303e6 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -43,8 +43,8 @@ def find_all_potential_quotation_marks_in_text_segment( quotation_matches.append( QuotationMarkStringMatch( text_segment, - text_segment.text.string_index_to_glyph_index(quotation_mark_match.start()), - text_segment.text.string_index_to_glyph_index(quotation_mark_match.end()), + quotation_mark_match.start(), + quotation_mark_match.end(), ) ) return quotation_matches diff --git 
a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index c4348c2d..c8d44bde 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -1,4 +1,3 @@ -import unicodedata from typing import Optional, Set from ..corpora.usfm_token import UsfmToken @@ -7,7 +6,7 @@ class TextSegment: def __init__(self): - self._text: GlyphString = GlyphString("") + self._text = "" self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER self._markers_in_preceding_context: Set[UsfmMarkerType] = set() self.previous_segment: Optional[TextSegment] = None @@ -32,7 +31,7 @@ def __eq__(self, value): return True @property - def text(self) -> "GlyphString": + def text(self) -> str: return self._text @property @@ -55,7 +54,7 @@ def is_last_segment_in_verse(self) -> bool: return self.index_in_verse == self.num_segments_in_verse - 1 def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: - self._text = GlyphString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) + self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index) if self._usfm_token is not None: self._usfm_token.text = str(self._text) @@ -77,70 +76,8 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": return self def set_text(self, text: str) -> "TextSegment.Builder": - self._text_segment._text = GlyphString(text) + self._text_segment._text = text return self def build(self) -> "TextSegment": return self._text_segment - - -class GlyphString: - def __init__(self, string: str) -> None: - self._string = string - self._string_index_by_glyph_index = { - glyph_index: string_index - for glyph_index, string_index in enumerate( - [i for i, c in enumerate(string) if unicodedata.category(c) not in ["Mc", "Mn"]] - ) - } - - def __len__(self) -> int: - return len(self._string_index_by_glyph_index) - - def __str__(self): - return self._string - - def __eq__(self, other) -> bool: - if not isinstance(other, GlyphString): - return False - return self._string == other._string - - def __getitem__(self, key) -> "GlyphString": - if isinstance(key, int): - glyph_start = self._normalize_start_index(key) - glyph_stop = self._normalize_stop_index(glyph_start + 1) - string_start = self._string_index_by_glyph_index.get(glyph_start, len(self)) - string_stop = self._string_index_by_glyph_index.get(glyph_stop, None) - return GlyphString(self._string[string_start:string_stop]) - elif isinstance(key, slice): - if key.step is not None and key.step != 1: - raise TypeError("Steps are not allowed in _glyphString slices") - glyph_start = self._normalize_start_index(key.start) - glyph_stop = self._normalize_stop_index(key.stop) - string_start = self._string_index_by_glyph_index.get(glyph_start, len(self)) - string_stop = self._string_index_by_glyph_index.get(glyph_stop, None) - return GlyphString(self._string[string_start:string_stop]) - else: - raise TypeError("Indices must be integers or slices") - - def _normalize_start_index(self, index: Optional[int]) -> int: - if index is None: - return 0 - if index < 0: - return len(self) + index - return index - - def _normalize_stop_index(self, index: Optional[int]) -> int: - if index is None: - return len(self) - if index < 0: - return len(self) + index - return index - - def string_index_to_glyph_index(self, string_index: int) -> int: - if string_index == len(self._string): - return len(self) - for g_index, s_index in 
self._string_index_by_glyph_index.items(): - if s_index == string_index: - return g_index - raise ValueError(f"No corresponding glyph index found for string index {string_index}.") diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index 6f3bdfe0..baadf8eb 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -476,13 +476,9 @@ def test_process_scripture_element() -> None: assert quote_convention_changer._quotation_mark_finder.num_times_called == 1 assert mock_quotation_mark_resolver.num_times_called == 1 + assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text == "this is a ‘test" assert ( - str(quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text) - == "this is a ‘test" - ) - assert ( - str(quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text) - == "the test ends” here" + quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text == "the test ends” here" ) @@ -497,7 +493,7 @@ def test_create_text_segments_basic() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert str(text_segments[0].text) == "test segment" + assert text_segments[0].text == "test segment" assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert text_segments[0]._markers_in_preceding_context == set() assert text_segments[0].previous_segment is None @@ -520,7 +516,7 @@ def test_create_text_segments_with_preceding_markers() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert str(text_segments[0].text) == "test segment" + assert text_segments[0].text == "test segment" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -550,7 +546,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 2 - assert str(text_segments[0].text) == "test segment1" + assert text_segments[0].text == "test segment1" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -558,7 +554,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: } assert text_segments[0].previous_segment is None assert text_segments[0].next_segment == text_segments[1] - assert str(text_segments[1].text) == "test segment2" + assert text_segments[1].text == "test segment2" assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER assert text_segments[1]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -577,7 +573,7 @@ def test_create_text_segment() -> None: segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token) assert segment is not None - assert str(segment.text) == "test segment" + assert segment.text == "test segment" assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert segment._markers_in_preceding_context == set() assert 
segment._usfm_token == usfm_token @@ -647,7 +643,7 @@ def test_update_quotation_marks() -> None: multi_char_to_single_char_quote_convention_changer._update_quotation_marks(multi_character_quotation_marks) - assert str(multi_character_text_segment.text) == "this “is ‘a test segment’ ”" + assert multi_character_text_segment.text == "this “is ‘a test segment’ ”" assert multi_character_quotation_marks[0].start_index == 5 assert multi_character_quotation_marks[0].end_index == 6 @@ -707,7 +703,7 @@ def test_update_quotation_marks() -> None: single_char_to_multi_char_quote_convention_changer._update_quotation_marks(single_character_quotation_marks) - assert str(single_character_text_segment.text) == "this < >>" + assert single_character_text_segment.text == "this < >>" assert single_character_quotation_marks[0].start_index == 5 assert single_character_quotation_marks[0].end_index == 7 @@ -768,7 +764,7 @@ def test_start_new_chapter() -> None: segment = quote_convention_changer._next_scripture_text_segment_builder.build() assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER - assert str(segment.text) == "" + assert segment.text == "" assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set() diff --git a/tests/punctuation_analysis/test_quotation_mark_finder.py b/tests/punctuation_analysis/test_quotation_mark_finder.py index f3e26c21..035f50f0 100644 --- a/tests/punctuation_analysis/test_quotation_mark_finder.py +++ b/tests/punctuation_analysis/test_quotation_mark_finder.py @@ -177,7 +177,7 @@ def test_that_all_possible_quotation_marks_are_identified() -> None: assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build() - ) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 6, 7)] + ) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 9, 10)] def test_that_it_uses_the_quote_convention_set() -> None: diff --git a/tests/punctuation_analysis/test_quotation_mark_metadata.py b/tests/punctuation_analysis/test_quotation_mark_metadata.py index 44e43518..4e4bab1a 100644 --- a/tests/punctuation_analysis/test_quotation_mark_metadata.py +++ b/tests/punctuation_analysis/test_quotation_mark_metadata.py @@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -30,7 +30,7 @@ def test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, «Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, «Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert 
str(quotation_mark_metadata.text_segment.text) == 'He said to the woman, "«as God really said,' + assert quotation_mark_metadata.text_segment.text == 'He said to the woman, "«as God really said,' def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: @@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french")) - assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, < None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said," assert quotation_mark_metadata.start_index == 22 assert quotation_mark_metadata.end_index == 23 diff --git a/tests/punctuation_analysis/test_quotation_mark_string_match.py b/tests/punctuation_analysis/test_quotation_mark_string_match.py index c24e3c3e..7f478f73 100644 --- a/tests/punctuation_analysis/test_quotation_mark_string_match.py +++ b/tests/punctuation_analysis/test_quotation_mark_string_match.py @@ -129,7 +129,7 @@ def test_get_previous_character() -> None: 0, 1, ) - assert quotation_mark_string_match.previous_character == "ले" + assert quotation_mark_string_match.previous_character == "\u0947" def test_get_next_character() -> None: diff --git a/tests/punctuation_analysis/test_text_segment.py b/tests/punctuation_analysis/test_text_segment.py index 9d2b8744..11932f00 100644 --- a/tests/punctuation_analysis/test_text_segment.py +++ b/tests/punctuation_analysis/test_text_segment.py @@ -5,7 +5,7 @@ def test_builder_initialization() -> None: builder = TextSegment.Builder() - assert str(builder._text_segment.text) == "" + assert builder._text_segment.text == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER @@ -20,7 +20,7 @@ def test_builder_set_text() -> None: text = "Example text" builder.set_text(text) - assert str(builder._text_segment.text) == text + assert builder._text_segment.text == text def test_builder_set_previous_segment() -> None: @@ -62,7 +62,7 @@ def test_builder_set_usfm_token() -> None: assert builder._text_segment._usfm_token is not None assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT assert builder._text_segment._usfm_token.text == "USFM token text" - assert str(builder._text_segment.text) == "" + assert builder._text_segment.text == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None @@ -148,10 +148,10 @@ def test_equals() -> None: def test_get_text() -> None: text_segment = TextSegment.Builder().set_text("example text").build() - assert str(text_segment.text) == "example text" + assert text_segment.text == "example text" text_segment = TextSegment.Builder().set_text("new example text").build() - assert str(text_segment.text) == "new example text" + assert text_segment.text == "new example text" def test_length() -> None: @@ -163,7 +163,7 @@ def test_length() -> None: # Combining characters text_segment = TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build() - assert text_segment.length == 11 + assert text_segment.length == 17 # Surrogate pairs text_segment = 
TextSegment.Builder().set_text("𝜺𝜺").build() @@ -251,28 +251,28 @@ def test_is_last_segment_in_verse() -> None: def test_replace_substring() -> None: text_segment = TextSegment.Builder().set_text("example text").build() text_segment.replace_substring(0, 7, "sample") - assert str(text_segment.text) == "sample text" + assert text_segment.text == "sample text" text_segment.replace_substring(7, 11, "text") - assert str(text_segment.text) == "sample text" + assert text_segment.text == "sample text" text_segment.replace_substring(0, 7, "") - assert str(text_segment.text) == "text" + assert text_segment.text == "text" text_segment.replace_substring(0, 4, "new'") - assert str(text_segment.text) == "new'" + assert text_segment.text == "new'" text_segment.replace_substring(3, 4, "\u2019") - assert str(text_segment.text) == "new\u2019" + assert text_segment.text == "new\u2019" text_segment.replace_substring(0, 0, "prefix ") - assert str(text_segment.text) == "prefix new\u2019" + assert text_segment.text == "prefix new\u2019" text_segment.replace_substring(0, 0, "") - assert str(text_segment.text) == "prefix new\u2019" + assert text_segment.text == "prefix new\u2019" text_segment.replace_substring(11, 11, " suffix") - assert str(text_segment.text) == "prefix new\u2019 suffix" + assert text_segment.text == "prefix new\u2019 suffix" text_segment.replace_substring(6, 6, "-") - assert str(text_segment.text) == "prefix- new\u2019 suffix" + assert text_segment.text == "prefix- new\u2019 suffix"
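
For reference, the indexing behavior that the final patch settles on (dropping the custom grapheme/glyph wrapper) can be reproduced with only the Python standard library. The following standalone sketch is separate from the series itself; it uses the Devanagari sample string from the ported tests and shows why those tests expect code-point offsets such as (9, 10) for the quotation mark rather than the grapheme-based (6, 7) used before the last patch:

import re
import unicodedata

# Combining marks are ordinary code points to Python's str type.
assert unicodedata.category("\u094d") == "Mn"   # DEVANAGARI SIGN VIRAMA (non-spacing mark)
assert unicodedata.category("\u093f") == "Mc"   # DEVANAGARI VOWEL SIGN I (spacing mark)

# Sample from the ported tests: eight letter/mark code points, a space,
# then the quotation mark, then eight more letter/mark code points.
text = 'उत्पत्ति "पुस्तकले'
assert len(text) == 18  # counts code points, not user-perceived characters

# Regex matches report offsets in code points, so the quotation mark is found
# at (9, 10); an index that skipped Mn/Mc marks would have reported (6, 7).
match = next(re.finditer('"', text))
assert (match.start(), match.end()) == (9, 10)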