From 905a05633c1dca20bda02e52d17ad67dd966c52e Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 11 Sep 2025 17:44:06 -0400 Subject: [PATCH 1/6] Port unicode string-related tests to Python; address discrepancy regarding combining characters in Python strings --- machine/corpora/__init__.py | 2 + .../quotation_mark_finder.py | 8 +- .../quotation_mark_string_match.py | 12 +-- machine/punctuation_analysis/text_segment.py | 81 +++++++++++++++++-- ...tion_changing_usfm_block_update_handler.py | 23 +++--- tests/corpora/test_usfm_manual.py | 29 ++++++- .../test_quotation_mark_finder.py | 4 + .../test_quotation_mark_metadata.py | 10 +-- .../test_quotation_mark_string_match.py | 10 +++ .../punctuation_analysis/test_text_segment.py | 36 +++++---- tests/testutils/corpora_test_helpers.py | 2 + 11 files changed, 172 insertions(+), 45 deletions(-) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index dd540f3b..0a7c8f19 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -85,6 +85,7 @@ from .usx_file_text_corpus import UsxFileTextCorpus from .usx_memory_text import UsxMemoryText from .usx_zip_text import UsxZipText +from .zip_paratext_project_quote_convention_detector import ZipParatextProjectQuoteConventionDetector from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser @@ -188,6 +189,7 @@ "UsxFileTextCorpus", "UsxMemoryText", "UsxZipText", + "ZipParatextProjectQuoteConventionDetector", "ZipParatextProjectSettingsParser", "ZipParatextProjectSettingsParserBase", "ZipParatextProjectTermsParser", diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 73c95368..14e8e495 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -36,11 +36,15 @@ def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment ) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text): + for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text.string): if self._quote_conventions.is_valid_opening_quotation_mark( quotation_mark_match.group() ) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()): quotation_matches.append( - QuotationMarkStringMatch(text_segment, quotation_mark_match.start(), quotation_mark_match.end()) + QuotationMarkStringMatch( + text_segment, + text_segment.text.string_index_to_grapheme_index(quotation_mark_match.start()), + text_segment.text.string_index_to_grapheme_index(quotation_mark_match.end()), + ) ) return quotation_matches diff --git a/machine/punctuation_analysis/quotation_mark_string_match.py b/machine/punctuation_analysis/quotation_mark_string_match.py index 573e37c7..e9a82d8d 100644 --- a/machine/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/punctuation_analysis/quotation_mark_string_match.py @@ -35,7 +35,7 @@ def __eq__(self, value): @property def quotation_mark(self) -> str: - return self._text_segment.text[self._start_index : self._end_index] + return self._text_segment.text[self._start_index : self._end_index].string def is_valid_opening_quotation_mark(self, quote_conventions: 
QuoteConventionSet) -> bool: return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark) @@ -59,18 +59,18 @@ def previous_character(self) -> Optional[str]: if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( UsfmMarkerType.PARAGRAPH ): - return previous_segment.text[-1] + return previous_segment.text[-1].string return None - return self._text_segment.text[self._start_index - 1] + return self._text_segment.text[self._start_index - 1].string @property def next_character(self) -> Optional[str]: if self.is_at_end_of_segment(): next_segment = self._text_segment.next_segment if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH): - return next_segment.text[0] + return next_segment.text[0].string return None - return self._text_segment.text[self._end_index] + return self._text_segment.text[self._end_index].string def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool: return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None @@ -102,7 +102,7 @@ def end_index(self) -> int: def context(self) -> str: return self._text_segment.text[ max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) - ] + ].string def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: return QuotationMarkMetadata( diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index 78e63d4a..e2271f9a 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -1,3 +1,4 @@ +import unicodedata from typing import Optional, Set from ..corpora.usfm_token import UsfmToken @@ -6,7 +7,7 @@ class TextSegment: def __init__(self): - self._text = "" + self._text: GraphemeString = GraphemeString("") self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER self._markers_in_preceding_context: Set[UsfmMarkerType] = set() self.previous_segment: Optional[TextSegment] = None @@ -31,7 +32,7 @@ def __eq__(self, value): return True @property - def text(self) -> str: + def text(self) -> "GraphemeString": return self._text @property @@ -39,10 +40,10 @@ def length(self) -> int: return len(self._text) def substring_before(self, index: int) -> str: - return self._text[:index] + return self._text[:index].string def substring_after(self, index: int) -> str: - return self._text[index:] + return self._text[index:].string def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool: return marker in self._markers_in_preceding_context @@ -54,9 +55,9 @@ def is_last_segment_in_verse(self) -> bool: return self.index_in_verse == self.num_segments_in_verse - 1 def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: - self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index) + self._text = GraphemeString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) if self._usfm_token is not None: - self._usfm_token.text = self._text + self._usfm_token.text = self._text.string class Builder: def __init__(self): @@ -76,8 +77,74 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": return self def set_text(self, text: str) -> "TextSegment.Builder": - self._text_segment._text = text + self._text_segment._text = GraphemeString(text) return self def build(self) -> "TextSegment": return self._text_segment + + 
+class GraphemeString: + def __init__(self, string: str) -> None: + self._string = string + self._string_index_by_grapheme_index = { + grapheme_index: string_index + for grapheme_index, string_index in enumerate( + [i for i, c in enumerate(string) if unicodedata.category(c) not in ["Mc", "Mn"]] + ) + } + + def __len__(self) -> int: + return len(self._string_index_by_grapheme_index) + + @property + def string(self) -> str: + return self._string + + def __str__(self): + return self._string + + def __eq__(self, other) -> bool: + if not isinstance(other, GraphemeString): + return False + return self._string == other.string + + def __getitem__(self, key) -> "GraphemeString": + if isinstance(key, int): + grapheme_start = self._normalize_start_index(key) + grapheme_stop = self._normalize_stop_index(grapheme_start + 1) + string_start = self._string_index_by_grapheme_index.get(grapheme_start, len(self)) + string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, None) + return GraphemeString(self._string[string_start:string_stop]) + elif isinstance(key, slice): + if key.step is not None and key.step != 1: + raise TypeError("Steps are not allowed in _GraphemeString slices") + grapheme_start = self._normalize_start_index(key.start) + grapheme_stop = self._normalize_stop_index(key.stop) + string_start = self._string_index_by_grapheme_index.get(grapheme_start, len(self)) + string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, None) + return GraphemeString(self._string[string_start:string_stop]) + else: + raise TypeError("Indices must be integers or slices") + + def _normalize_start_index(self, index: int | None) -> int: + if index is None: + return 0 + if index < 0: + return len(self) + index + return index + + def _normalize_stop_index(self, index: int | None) -> int: + if index is None: + return len(self) + if index < 0: + return len(self) + index + return index + + def string_index_to_grapheme_index(self, string_index: int) -> int: + if string_index == len(self._string): + return len(self) + for g_index, s_index in self._string_index_by_grapheme_index.items(): + if s_index == string_index: + return g_index + raise ValueError(f"No corresponding grapheme index found for string index {string_index}.") diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index 5aca5561..6f3bdfe0 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -476,9 +476,12 @@ def test_process_scripture_element() -> None: assert quote_convention_changer._quotation_mark_finder.num_times_called == 1 assert mock_quotation_mark_resolver.num_times_called == 1 - assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment._text == "this is a ‘test" assert ( - quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment._text + str(quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text) + == "this is a ‘test" + ) + assert ( + str(quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text) == "the test ends” here" ) @@ -494,7 +497,7 @@ def test_create_text_segments_basic() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert text_segments[0]._text == "test segment" + assert 
str(text_segments[0].text) == "test segment" assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert text_segments[0]._markers_in_preceding_context == set() assert text_segments[0].previous_segment is None @@ -517,7 +520,7 @@ def test_create_text_segments_with_preceding_markers() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert text_segments[0]._text == "test segment" + assert str(text_segments[0].text) == "test segment" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -547,7 +550,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 2 - assert text_segments[0]._text == "test segment1" + assert str(text_segments[0].text) == "test segment1" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -555,7 +558,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: } assert text_segments[0].previous_segment is None assert text_segments[0].next_segment == text_segments[1] - assert text_segments[1]._text == "test segment2" + assert str(text_segments[1].text) == "test segment2" assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER assert text_segments[1]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -574,7 +577,7 @@ def test_create_text_segment() -> None: segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token) assert segment is not None - assert segment._text == "test segment" + assert str(segment.text) == "test segment" assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert segment._markers_in_preceding_context == set() assert segment._usfm_token == usfm_token @@ -644,7 +647,7 @@ def test_update_quotation_marks() -> None: multi_char_to_single_char_quote_convention_changer._update_quotation_marks(multi_character_quotation_marks) - assert multi_character_text_segment.text == "this “is ‘a test segment’ ”" + assert str(multi_character_text_segment.text) == "this “is ‘a test segment’ ”" assert multi_character_quotation_marks[0].start_index == 5 assert multi_character_quotation_marks[0].end_index == 6 @@ -704,7 +707,7 @@ def test_update_quotation_marks() -> None: single_char_to_multi_char_quote_convention_changer._update_quotation_marks(single_character_quotation_marks) - assert single_character_text_segment.text == "this < >>" + assert str(single_character_text_segment.text) == "this < >>" assert single_character_quotation_marks[0].start_index == 5 assert single_character_quotation_marks[0].end_index == 7 @@ -765,7 +768,7 @@ def test_start_new_chapter() -> None: segment = quote_convention_changer._next_scripture_text_segment_builder.build() assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER - assert segment._text == "" + assert str(segment.text) == "" assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set() diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py index 
b795997b..45166d26 100644 --- a/tests/corpora/test_usfm_manual.py +++ b/tests/corpora/test_usfm_manual.py @@ -5,7 +5,13 @@ from typing import List, Optional import pytest -from testutils.corpora_test_helpers import TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_TARGET_PROJECT_PATH +from testutils.corpora_test_helpers import ( + TEST_DATA_PATH, + USFM_SOURCE_PROJECT_PATH, + USFM_SOURCE_PROJECT_ZIP_PATH, + USFM_TARGET_PROJECT_PATH, + USFM_TARGET_PROJECT_ZIP_PATH, +) from machine.corpora import ( FileParatextProjectSettingsParser, @@ -15,9 +21,11 @@ StandardParallelTextCorpus, UpdateUsfmRow, UpdateUsfmTextBehavior, + ZipParatextProjectQuoteConventionDetector, ZipParatextProjectSettingsParser, ZipParatextProjectTextUpdater, ) +from machine.punctuation_analysis import QuoteConventionDetector @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") @@ -124,3 +132,22 @@ def get_usfm(project_path: Path): assert False, f"Failed to process {subdir}: {e}" else: get_usfm(PARATEXT_PROJECT_PATH) + + +@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") +def test_analyze_corpora_quote_conventions(): + source_handler = QuoteConventionDetector() + source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") + source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive) + source_quote_convention_detector.get_quote_convention_analysis(source_handler) + + target_handler = QuoteConventionDetector() + target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r") + target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive) + target_quote_convention_detector.get_quote_convention_analysis(target_handler) + + source_analysis = source_handler.detect_quote_convention() + target_analysis = target_handler.detect_quote_convention() + + assert source_analysis is not None + assert target_analysis is not None diff --git a/tests/punctuation_analysis/test_quotation_mark_finder.py b/tests/punctuation_analysis/test_quotation_mark_finder.py index 5d1a709c..f3e26c21 100644 --- a/tests/punctuation_analysis/test_quotation_mark_finder.py +++ b/tests/punctuation_analysis/test_quotation_mark_finder.py @@ -175,6 +175,10 @@ def test_that_all_possible_quotation_marks_are_identified() -> None: ), ] + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build() + ) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 6, 7)] + def test_that_it_uses_the_quote_convention_set() -> None: standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") diff --git a/tests/punctuation_analysis/test_quotation_mark_metadata.py b/tests/punctuation_analysis/test_quotation_mark_metadata.py index 5f2b265d..44e43518 100644 --- a/tests/punctuation_analysis/test_quotation_mark_metadata.py +++ b/tests/punctuation_analysis/test_quotation_mark_metadata.py @@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said," + assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -30,7 +30,7 @@ def 
test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, «Has God really said," + assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, «Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,' + assert str(quotation_mark_metadata.text_segment.text) == 'He said to the woman, "«as God really said,' def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: @@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, < None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said," + assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said," assert quotation_mark_metadata.start_index == 22 assert quotation_mark_metadata.end_index == 23 diff --git a/tests/punctuation_analysis/test_quotation_mark_string_match.py b/tests/punctuation_analysis/test_quotation_mark_string_match.py index 39485493..c24e3c3e 100644 --- a/tests/punctuation_analysis/test_quotation_mark_string_match.py +++ b/tests/punctuation_analysis/test_quotation_mark_string_match.py @@ -121,6 +121,16 @@ def test_get_previous_character() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) assert quotation_mark_string_match.previous_character == "“" + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder() + .set_text('"उत्पत्ति पुस्तकले') + .set_previous_segment(TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build()) + .build(), + 0, + 1, + ) + assert quotation_mark_string_match.previous_character == "ले" + def test_get_next_character() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) diff --git a/tests/punctuation_analysis/test_text_segment.py b/tests/punctuation_analysis/test_text_segment.py index aa215d67..9d2b8744 100644 --- a/tests/punctuation_analysis/test_text_segment.py +++ b/tests/punctuation_analysis/test_text_segment.py @@ -5,7 +5,7 @@ def test_builder_initialization() -> None: builder = TextSegment.Builder() - assert builder._text_segment._text == "" + assert str(builder._text_segment.text) == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER @@ -20,7 +20,7 @@ def test_builder_set_text() -> None: text = "Example text" builder.set_text(text) - assert builder._text_segment._text == text + assert str(builder._text_segment.text) == text def test_builder_set_previous_segment() -> None: @@ -62,7 +62,7 @@ def test_builder_set_usfm_token() -> None: assert 
builder._text_segment._usfm_token is not None assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT assert builder._text_segment._usfm_token.text == "USFM token text" - assert builder._text_segment._text == "" + assert str(builder._text_segment.text) == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None @@ -148,10 +148,10 @@ def test_equals() -> None: def test_get_text() -> None: text_segment = TextSegment.Builder().set_text("example text").build() - assert text_segment.text == "example text" + assert str(text_segment.text) == "example text" text_segment = TextSegment.Builder().set_text("new example text").build() - assert text_segment.text == "new example text" + assert str(text_segment.text) == "new example text" def test_length() -> None: @@ -161,6 +161,14 @@ def test_length() -> None: text_segment = TextSegment.Builder().set_text("new example text").build() assert text_segment.length == len("new example text") + # Combining characters + text_segment = TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build() + assert text_segment.length == 11 + + # Surrogate pairs + text_segment = TextSegment.Builder().set_text("𝜺𝜺").build() + assert text_segment.length == 2 + def test_substring_before() -> None: text_segment = TextSegment.Builder().set_text("example text").build() @@ -243,28 +251,28 @@ def test_is_last_segment_in_verse() -> None: def test_replace_substring() -> None: text_segment = TextSegment.Builder().set_text("example text").build() text_segment.replace_substring(0, 7, "sample") - assert text_segment.text == "sample text" + assert str(text_segment.text) == "sample text" text_segment.replace_substring(7, 11, "text") - assert text_segment.text == "sample text" + assert str(text_segment.text) == "sample text" text_segment.replace_substring(0, 7, "") - assert text_segment.text == "text" + assert str(text_segment.text) == "text" text_segment.replace_substring(0, 4, "new'") - assert text_segment.text == "new'" + assert str(text_segment.text) == "new'" text_segment.replace_substring(3, 4, "\u2019") - assert text_segment.text == "new\u2019" + assert str(text_segment.text) == "new\u2019" text_segment.replace_substring(0, 0, "prefix ") - assert text_segment.text == "prefix new\u2019" + assert str(text_segment.text) == "prefix new\u2019" text_segment.replace_substring(0, 0, "") - assert text_segment.text == "prefix new\u2019" + assert str(text_segment.text) == "prefix new\u2019" text_segment.replace_substring(11, 11, " suffix") - assert text_segment.text == "prefix new\u2019 suffix" + assert str(text_segment.text) == "prefix new\u2019 suffix" text_segment.replace_substring(6, 6, "-") - assert text_segment.text == "prefix- new\u2019 suffix" + assert str(text_segment.text) == "prefix- new\u2019 suffix" diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index e2875605..6c1e7d8b 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -8,7 +8,9 @@ USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" +USFM_TARGET_PROJECT_ZIP_PATH = TEST_DATA_PATH / "project" / "target" USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" +USFM_SOURCE_PROJECT_ZIP_PATH = TEST_DATA_PATH / "project" / "source" USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id" USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id" USX_TEST_PROJECT_PATH = 
TEST_DATA_PATH / "usx" / "Tes" From c7c8c3afb9d2a9496b3425e5b2738b7cad91e644 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 11 Sep 2025 17:47:08 -0400 Subject: [PATCH 2/6] Remove redundant string property --- .../punctuation_analysis/quotation_mark_finder.py | 2 +- .../quotation_mark_string_match.py | 14 +++++++------- machine/punctuation_analysis/text_segment.py | 12 ++++-------- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 14e8e495..09982770 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -36,7 +36,7 @@ def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment ) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text.string): + for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(str(text_segment.text)): if self._quote_conventions.is_valid_opening_quotation_mark( quotation_mark_match.group() ) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()): diff --git a/machine/punctuation_analysis/quotation_mark_string_match.py b/machine/punctuation_analysis/quotation_mark_string_match.py index e9a82d8d..d73c5543 100644 --- a/machine/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/punctuation_analysis/quotation_mark_string_match.py @@ -35,7 +35,7 @@ def __eq__(self, value): @property def quotation_mark(self) -> str: - return self._text_segment.text[self._start_index : self._end_index].string + return str(self._text_segment.text[self._start_index : self._end_index]) def is_valid_opening_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool: return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark) @@ -59,18 +59,18 @@ def previous_character(self) -> Optional[str]: if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( UsfmMarkerType.PARAGRAPH ): - return previous_segment.text[-1].string + return str(previous_segment.text[-1]) return None - return self._text_segment.text[self._start_index - 1].string + return str(self._text_segment.text[self._start_index - 1]) @property def next_character(self) -> Optional[str]: if self.is_at_end_of_segment(): next_segment = self._text_segment.next_segment if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH): - return next_segment.text[0].string + return str(next_segment.text[0]) return None - return self._text_segment.text[self._end_index].string + return str(self._text_segment.text[self._end_index]) def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool: return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None @@ -100,9 +100,9 @@ def end_index(self) -> int: # Not used, but a useful method for debugging @property def context(self) -> str: - return self._text_segment.text[ + return str(self._text_segment.text[ max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) - ].string + ]) def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: return QuotationMarkMetadata( diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index e2271f9a..b232653a 
100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -40,10 +40,10 @@ def length(self) -> int: return len(self._text) def substring_before(self, index: int) -> str: - return self._text[:index].string + return str(self._text[:index]) def substring_after(self, index: int) -> str: - return self._text[index:].string + return str(self._text[index:]) def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool: return marker in self._markers_in_preceding_context @@ -57,7 +57,7 @@ def is_last_segment_in_verse(self) -> bool: def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: self._text = GraphemeString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) if self._usfm_token is not None: - self._usfm_token.text = self._text.string + self._usfm_token.text = str(self._text) class Builder: def __init__(self): @@ -97,17 +97,13 @@ def __init__(self, string: str) -> None: def __len__(self) -> int: return len(self._string_index_by_grapheme_index) - @property - def string(self) -> str: - return self._string - def __str__(self): return self._string def __eq__(self, other) -> bool: if not isinstance(other, GraphemeString): return False - return self._string == other.string + return self._string == other._string def __getitem__(self, key) -> "GraphemeString": if isinstance(key, int): From f9300507b54f44ba1c870ba02851930122aef7c1 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 12 Sep 2025 09:26:36 -0400 Subject: [PATCH 3/6] Fix formatting --- .../punctuation_analysis/quotation_mark_string_match.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_string_match.py b/machine/punctuation_analysis/quotation_mark_string_match.py index d73c5543..dcafa869 100644 --- a/machine/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/punctuation_analysis/quotation_mark_string_match.py @@ -100,9 +100,11 @@ def end_index(self) -> int: # Not used, but a useful method for debugging @property def context(self) -> str: - return str(self._text_segment.text[ - max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) - ]) + return str( + self._text_segment.text[ + max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) + ] + ) def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: return QuotationMarkMetadata( From a12b917865df830b86d4996f1a7a630b98744262 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 12 Sep 2025 09:32:20 -0400 Subject: [PATCH 4/6] Use Optional[] --- machine/punctuation_analysis/text_segment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index b232653a..cfa0b1a1 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -123,14 +123,14 @@ def __getitem__(self, key) -> "GraphemeString": else: raise TypeError("Indices must be integers or slices") - def _normalize_start_index(self, index: int | None) -> int: + def _normalize_start_index(self, index: Optional[int]) -> int: if index is None: return 0 if index < 0: return len(self) + index return index - def _normalize_stop_index(self, index: int | None) -> int: + def _normalize_stop_index(self, index: Optional[int]) -> int: if index is None: return len(self) if index < 0: From 
38292fdfb68ad7e2e0171a48cb75c17696a42952 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 15 Sep 2025 08:17:10 -0400 Subject: [PATCH 5/6] Change class name --- .../quotation_mark_finder.py | 4 +- machine/punctuation_analysis/text_segment.py | 50 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 09982770..65b7e784 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -43,8 +43,8 @@ def find_all_potential_quotation_marks_in_text_segment( quotation_matches.append( QuotationMarkStringMatch( text_segment, - text_segment.text.string_index_to_grapheme_index(quotation_mark_match.start()), - text_segment.text.string_index_to_grapheme_index(quotation_mark_match.end()), + text_segment.text.string_index_to_glyph_index(quotation_mark_match.start()), + text_segment.text.string_index_to_glyph_index(quotation_mark_match.end()), ) ) return quotation_matches diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index cfa0b1a1..c4348c2d 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -7,7 +7,7 @@ class TextSegment: def __init__(self): - self._text: GraphemeString = GraphemeString("") + self._text: GlyphString = GlyphString("") self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER self._markers_in_preceding_context: Set[UsfmMarkerType] = set() self.previous_segment: Optional[TextSegment] = None @@ -32,7 +32,7 @@ def __eq__(self, value): return True @property - def text(self) -> "GraphemeString": + def text(self) -> "GlyphString": return self._text @property @@ -55,7 +55,7 @@ def is_last_segment_in_verse(self) -> bool: return self.index_in_verse == self.num_segments_in_verse - 1 def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: - self._text = GraphemeString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) + self._text = GlyphString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) if self._usfm_token is not None: self._usfm_token.text = str(self._text) @@ -77,49 +77,49 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": return self def set_text(self, text: str) -> "TextSegment.Builder": - self._text_segment._text = GraphemeString(text) + self._text_segment._text = GlyphString(text) return self def build(self) -> "TextSegment": return self._text_segment -class GraphemeString: +class GlyphString: def __init__(self, string: str) -> None: self._string = string - self._string_index_by_grapheme_index = { - grapheme_index: string_index - for grapheme_index, string_index in enumerate( + self._string_index_by_glyph_index = { + glyph_index: string_index + for glyph_index, string_index in enumerate( [i for i, c in enumerate(string) if unicodedata.category(c) not in ["Mc", "Mn"]] ) } def __len__(self) -> int: - return len(self._string_index_by_grapheme_index) + return len(self._string_index_by_glyph_index) def __str__(self): return self._string def __eq__(self, other) -> bool: - if not isinstance(other, GraphemeString): + if not isinstance(other, GlyphString): return False return self._string == other._string - def __getitem__(self, key) -> "GraphemeString": + def __getitem__(self, key) -> "GlyphString": if isinstance(key, int): - 
grapheme_start = self._normalize_start_index(key) - grapheme_stop = self._normalize_stop_index(grapheme_start + 1) - string_start = self._string_index_by_grapheme_index.get(grapheme_start, len(self)) - string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, None) - return GraphemeString(self._string[string_start:string_stop]) + glyph_start = self._normalize_start_index(key) + glyph_stop = self._normalize_stop_index(glyph_start + 1) + string_start = self._string_index_by_glyph_index.get(glyph_start, len(self)) + string_stop = self._string_index_by_glyph_index.get(glyph_stop, None) + return GlyphString(self._string[string_start:string_stop]) elif isinstance(key, slice): if key.step is not None and key.step != 1: - raise TypeError("Steps are not allowed in _GraphemeString slices") - grapheme_start = self._normalize_start_index(key.start) - grapheme_stop = self._normalize_stop_index(key.stop) - string_start = self._string_index_by_grapheme_index.get(grapheme_start, len(self)) - string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, None) - return GraphemeString(self._string[string_start:string_stop]) + raise TypeError("Steps are not allowed in _glyphString slices") + glyph_start = self._normalize_start_index(key.start) + glyph_stop = self._normalize_stop_index(key.stop) + string_start = self._string_index_by_glyph_index.get(glyph_start, len(self)) + string_stop = self._string_index_by_glyph_index.get(glyph_stop, None) + return GlyphString(self._string[string_start:string_stop]) else: raise TypeError("Indices must be integers or slices") @@ -137,10 +137,10 @@ def _normalize_stop_index(self, index: Optional[int]) -> int: return len(self) + index return index - def string_index_to_grapheme_index(self, string_index: int) -> int: + def string_index_to_glyph_index(self, string_index: int) -> int: if string_index == len(self._string): return len(self) - for g_index, s_index in self._string_index_by_grapheme_index.items(): + for g_index, s_index in self._string_index_by_glyph_index.items(): if s_index == string_index: return g_index - raise ValueError(f"No corresponding grapheme index found for string index {string_index}.") + raise ValueError(f"No corresponding glyph index found for string index {string_index}.") From 433c57916d71fcfd91865a296ba8f69aa6bb186c Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 15 Sep 2025 18:16:21 -0400 Subject: [PATCH 6/6] Remove custom combining character handling --- .../quotation_mark_finder.py | 4 +- machine/punctuation_analysis/text_segment.py | 71 ++----------------- ...tion_changing_usfm_block_update_handler.py | 24 +++---- .../test_quotation_mark_finder.py | 2 +- .../test_quotation_mark_metadata.py | 10 +-- .../test_quotation_mark_string_match.py | 2 +- .../punctuation_analysis/test_text_segment.py | 30 ++++---- 7 files changed, 38 insertions(+), 105 deletions(-) diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 65b7e784..6d7303e6 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -43,8 +43,8 @@ def find_all_potential_quotation_marks_in_text_segment( quotation_matches.append( QuotationMarkStringMatch( text_segment, - text_segment.text.string_index_to_glyph_index(quotation_mark_match.start()), - text_segment.text.string_index_to_glyph_index(quotation_mark_match.end()), + quotation_mark_match.start(), + quotation_mark_match.end(), ) ) return quotation_matches diff --git 
a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index c4348c2d..c8d44bde 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -1,4 +1,3 @@ -import unicodedata from typing import Optional, Set from ..corpora.usfm_token import UsfmToken @@ -7,7 +6,7 @@ class TextSegment: def __init__(self): - self._text: GlyphString = GlyphString("") + self._text = "" self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER self._markers_in_preceding_context: Set[UsfmMarkerType] = set() self.previous_segment: Optional[TextSegment] = None @@ -32,7 +31,7 @@ def __eq__(self, value): return True @property - def text(self) -> "GlyphString": + def text(self) -> str: return self._text @property @@ -55,7 +54,7 @@ def is_last_segment_in_verse(self) -> bool: return self.index_in_verse == self.num_segments_in_verse - 1 def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: - self._text = GlyphString(self.substring_before(start_index) + replacement + self.substring_after(end_index)) + self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index) if self._usfm_token is not None: self._usfm_token.text = str(self._text) @@ -77,70 +76,8 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": return self def set_text(self, text: str) -> "TextSegment.Builder": - self._text_segment._text = GlyphString(text) + self._text_segment._text = text return self def build(self) -> "TextSegment": return self._text_segment - - -class GlyphString: - def __init__(self, string: str) -> None: - self._string = string - self._string_index_by_glyph_index = { - glyph_index: string_index - for glyph_index, string_index in enumerate( - [i for i, c in enumerate(string) if unicodedata.category(c) not in ["Mc", "Mn"]] - ) - } - - def __len__(self) -> int: - return len(self._string_index_by_glyph_index) - - def __str__(self): - return self._string - - def __eq__(self, other) -> bool: - if not isinstance(other, GlyphString): - return False - return self._string == other._string - - def __getitem__(self, key) -> "GlyphString": - if isinstance(key, int): - glyph_start = self._normalize_start_index(key) - glyph_stop = self._normalize_stop_index(glyph_start + 1) - string_start = self._string_index_by_glyph_index.get(glyph_start, len(self)) - string_stop = self._string_index_by_glyph_index.get(glyph_stop, None) - return GlyphString(self._string[string_start:string_stop]) - elif isinstance(key, slice): - if key.step is not None and key.step != 1: - raise TypeError("Steps are not allowed in _glyphString slices") - glyph_start = self._normalize_start_index(key.start) - glyph_stop = self._normalize_stop_index(key.stop) - string_start = self._string_index_by_glyph_index.get(glyph_start, len(self)) - string_stop = self._string_index_by_glyph_index.get(glyph_stop, None) - return GlyphString(self._string[string_start:string_stop]) - else: - raise TypeError("Indices must be integers or slices") - - def _normalize_start_index(self, index: Optional[int]) -> int: - if index is None: - return 0 - if index < 0: - return len(self) + index - return index - - def _normalize_stop_index(self, index: Optional[int]) -> int: - if index is None: - return len(self) - if index < 0: - return len(self) + index - return index - - def string_index_to_glyph_index(self, string_index: int) -> int: - if string_index == len(self._string): - return len(self) - for g_index, s_index in 
self._string_index_by_glyph_index.items(): - if s_index == string_index: - return g_index - raise ValueError(f"No corresponding glyph index found for string index {string_index}.") diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index 6f3bdfe0..baadf8eb 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -476,13 +476,9 @@ def test_process_scripture_element() -> None: assert quote_convention_changer._quotation_mark_finder.num_times_called == 1 assert mock_quotation_mark_resolver.num_times_called == 1 + assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text == "this is a ‘test" assert ( - str(quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text) - == "this is a ‘test" - ) - assert ( - str(quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text) - == "the test ends” here" + quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text == "the test ends” here" ) @@ -497,7 +493,7 @@ def test_create_text_segments_basic() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert str(text_segments[0].text) == "test segment" + assert text_segments[0].text == "test segment" assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert text_segments[0]._markers_in_preceding_context == set() assert text_segments[0].previous_segment is None @@ -520,7 +516,7 @@ def test_create_text_segments_with_preceding_markers() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert str(text_segments[0].text) == "test segment" + assert text_segments[0].text == "test segment" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -550,7 +546,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 2 - assert str(text_segments[0].text) == "test segment1" + assert text_segments[0].text == "test segment1" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -558,7 +554,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: } assert text_segments[0].previous_segment is None assert text_segments[0].next_segment == text_segments[1] - assert str(text_segments[1].text) == "test segment2" + assert text_segments[1].text == "test segment2" assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER assert text_segments[1]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -577,7 +573,7 @@ def test_create_text_segment() -> None: segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token) assert segment is not None - assert str(segment.text) == "test segment" + assert segment.text == "test segment" assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert segment._markers_in_preceding_context == set() assert 
segment._usfm_token == usfm_token @@ -647,7 +643,7 @@ def test_update_quotation_marks() -> None: multi_char_to_single_char_quote_convention_changer._update_quotation_marks(multi_character_quotation_marks) - assert str(multi_character_text_segment.text) == "this “is ‘a test segment’ ”" + assert multi_character_text_segment.text == "this “is ‘a test segment’ ”" assert multi_character_quotation_marks[0].start_index == 5 assert multi_character_quotation_marks[0].end_index == 6 @@ -707,7 +703,7 @@ def test_update_quotation_marks() -> None: single_char_to_multi_char_quote_convention_changer._update_quotation_marks(single_character_quotation_marks) - assert str(single_character_text_segment.text) == "this < >>" + assert single_character_text_segment.text == "this < >>" assert single_character_quotation_marks[0].start_index == 5 assert single_character_quotation_marks[0].end_index == 7 @@ -768,7 +764,7 @@ def test_start_new_chapter() -> None: segment = quote_convention_changer._next_scripture_text_segment_builder.build() assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER - assert str(segment.text) == "" + assert segment.text == "" assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set() diff --git a/tests/punctuation_analysis/test_quotation_mark_finder.py b/tests/punctuation_analysis/test_quotation_mark_finder.py index f3e26c21..035f50f0 100644 --- a/tests/punctuation_analysis/test_quotation_mark_finder.py +++ b/tests/punctuation_analysis/test_quotation_mark_finder.py @@ -177,7 +177,7 @@ def test_that_all_possible_quotation_marks_are_identified() -> None: assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build() - ) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 6, 7)] + ) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 9, 10)] def test_that_it_uses_the_quote_convention_set() -> None: diff --git a/tests/punctuation_analysis/test_quotation_mark_metadata.py b/tests/punctuation_analysis/test_quotation_mark_metadata.py index 44e43518..4e4bab1a 100644 --- a/tests/punctuation_analysis/test_quotation_mark_metadata.py +++ b/tests/punctuation_analysis/test_quotation_mark_metadata.py @@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -30,7 +30,7 @@ def test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, «Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, «Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert 
str(quotation_mark_metadata.text_segment.text) == 'He said to the woman, "«as God really said,' + assert quotation_mark_metadata.text_segment.text == 'He said to the woman, "«as God really said,' def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: @@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french")) - assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, < None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said," assert quotation_mark_metadata.start_index == 22 assert quotation_mark_metadata.end_index == 23 diff --git a/tests/punctuation_analysis/test_quotation_mark_string_match.py b/tests/punctuation_analysis/test_quotation_mark_string_match.py index c24e3c3e..7f478f73 100644 --- a/tests/punctuation_analysis/test_quotation_mark_string_match.py +++ b/tests/punctuation_analysis/test_quotation_mark_string_match.py @@ -129,7 +129,7 @@ def test_get_previous_character() -> None: 0, 1, ) - assert quotation_mark_string_match.previous_character == "ले" + assert quotation_mark_string_match.previous_character == "\u0947" def test_get_next_character() -> None: diff --git a/tests/punctuation_analysis/test_text_segment.py b/tests/punctuation_analysis/test_text_segment.py index 9d2b8744..11932f00 100644 --- a/tests/punctuation_analysis/test_text_segment.py +++ b/tests/punctuation_analysis/test_text_segment.py @@ -5,7 +5,7 @@ def test_builder_initialization() -> None: builder = TextSegment.Builder() - assert str(builder._text_segment.text) == "" + assert builder._text_segment.text == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER @@ -20,7 +20,7 @@ def test_builder_set_text() -> None: text = "Example text" builder.set_text(text) - assert str(builder._text_segment.text) == text + assert builder._text_segment.text == text def test_builder_set_previous_segment() -> None: @@ -62,7 +62,7 @@ def test_builder_set_usfm_token() -> None: assert builder._text_segment._usfm_token is not None assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT assert builder._text_segment._usfm_token.text == "USFM token text" - assert str(builder._text_segment.text) == "" + assert builder._text_segment.text == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None @@ -148,10 +148,10 @@ def test_equals() -> None: def test_get_text() -> None: text_segment = TextSegment.Builder().set_text("example text").build() - assert str(text_segment.text) == "example text" + assert text_segment.text == "example text" text_segment = TextSegment.Builder().set_text("new example text").build() - assert str(text_segment.text) == "new example text" + assert text_segment.text == "new example text" def test_length() -> None: @@ -163,7 +163,7 @@ def test_length() -> None: # Combining characters text_segment = TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build() - assert text_segment.length == 11 + assert text_segment.length == 17 # Surrogate pairs text_segment = 
TextSegment.Builder().set_text("𝜺𝜺").build() @@ -251,28 +251,28 @@ def test_is_last_segment_in_verse() -> None: def test_replace_substring() -> None: text_segment = TextSegment.Builder().set_text("example text").build() text_segment.replace_substring(0, 7, "sample") - assert str(text_segment.text) == "sample text" + assert text_segment.text == "sample text" text_segment.replace_substring(7, 11, "text") - assert str(text_segment.text) == "sample text" + assert text_segment.text == "sample text" text_segment.replace_substring(0, 7, "") - assert str(text_segment.text) == "text" + assert text_segment.text == "text" text_segment.replace_substring(0, 4, "new'") - assert str(text_segment.text) == "new'" + assert text_segment.text == "new'" text_segment.replace_substring(3, 4, "\u2019") - assert str(text_segment.text) == "new\u2019" + assert text_segment.text == "new\u2019" text_segment.replace_substring(0, 0, "prefix ") - assert str(text_segment.text) == "prefix new\u2019" + assert text_segment.text == "prefix new\u2019" text_segment.replace_substring(0, 0, "") - assert str(text_segment.text) == "prefix new\u2019" + assert text_segment.text == "prefix new\u2019" text_segment.replace_substring(11, 11, " suffix") - assert str(text_segment.text) == "prefix new\u2019 suffix" + assert text_segment.text == "prefix new\u2019 suffix" text_segment.replace_substring(6, 6, "-") - assert str(text_segment.text) == "prefix- new\u2019 suffix" + assert text_segment.text == "prefix- new\u2019 suffix"
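
For reference, the indexing behavior that the final patch settles on (dropping the custom grapheme/glyph wrapper) can be reproduced with only the Python standard library. The following standalone sketch is separate from the series itself; it uses the Devanagari sample string from the ported tests and shows why those tests expect code-point offsets such as (9, 10) for the quotation mark rather than the grapheme-based (6, 7) used before the last patch:

import re
import unicodedata

# Combining marks are ordinary code points to Python's str type.
assert unicodedata.category("\u094d") == "Mn"   # DEVANAGARI SIGN VIRAMA (non-spacing mark)
assert unicodedata.category("\u093f") == "Mc"   # DEVANAGARI VOWEL SIGN I (spacing mark)

# Sample from the ported tests: eight letter/mark code points, a space,
# then the quotation mark, then eight more letter/mark code points.
text = 'उत्पत्ति "पुस्तकले'
assert len(text) == 18  # counts code points, not user-perceived characters

# Regex matches report offsets in code points, so the quotation mark is found
# at (9, 10); an index that skipped Mn/Mc marks would have reported (6, 7).
match = next(re.finditer('"', text))
assert (match.start(), match.end()) == (9, 10)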