From b0f223a337290ffffe01a043efb4ef5b5718e532 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 2 Apr 2025 11:13:04 -0400 Subject: [PATCH 01/31] Some tests pass --- machine/corpora/scripture_update_block.py | 45 +++++++++++++++++++ .../scripture_update_block_handler_base.py | 9 ++++ ...date_block_handler_first_elements_first.py | 23 ++++++++++ machine/corpora/scripture_update_element.py | 24 ++++++++++ machine/corpora/update_usfm_parser_handler.py | 1 + 5 files changed, 102 insertions(+) create mode 100644 machine/corpora/scripture_update_block.py create mode 100644 machine/corpora/scripture_update_block_handler_base.py create mode 100644 machine/corpora/scripture_update_block_handler_first_elements_first.py create mode 100644 machine/corpora/scripture_update_element.py diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py new file mode 100644 index 00000000..00787cf2 --- /dev/null +++ b/machine/corpora/scripture_update_block.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from .scripture_ref import ScriptureRef +from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType +from .usfm_token import UsfmToken, UsfmTokenType + + +class ScriptureUpdateBlock: + + def __init__(self) -> None: + self._ref: ScriptureRef = ScriptureRef() + self._elements: list[ScriptureUpdateElement] = [] + + def add_existing_text(self, token: UsfmToken, marked_for_removal: bool = False) -> None: + self._elements.append( + ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) + ) + + def add_inserted_text(self, tokens: list[UsfmToken]) -> None: + self._elements.append(ScriptureUpdateElement(ScriptureUpdateElementType.INSERTED_TEXT, tokens.copy())) + + def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None: + if token.type == UsfmTokenType.TEXT: + self._elements.append( + ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, 
[token], marked_for_removal) + ) + else: + self._elements.append(ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, [token], marked_for_removal)) + + def add_tokens(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None: + if len(tokens) == 0: + return + self._elements.append( + ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, tokens.copy(), marked_for_removal) + ) + + def update_ref(self, ref: ScriptureRef) -> None: + self._ref = ref + + def clear(self) -> None: + self._elements.clear() + self._ref = ScriptureRef() + + def get_tokens(self) -> list[UsfmToken]: + return [token for element in self._elements for token in element.get_tokens()] diff --git a/machine/corpora/scripture_update_block_handler_base.py b/machine/corpora/scripture_update_block_handler_base.py new file mode 100644 index 00000000..2998a0d9 --- /dev/null +++ b/machine/corpora/scripture_update_block_handler_base.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from .scripture_update_block import ScriptureUpdateBlock + + +class ScriptureUpdateBlockHandlerBase: + + def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + raise NotImplementedError("Must be implemented in subclass") diff --git a/machine/corpora/scripture_update_block_handler_first_elements_first.py b/machine/corpora/scripture_update_block_handler_first_elements_first.py new file mode 100644 index 00000000..17f44798 --- /dev/null +++ b/machine/corpora/scripture_update_block_handler_first_elements_first.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from .scripture_update_block import ScriptureUpdateBlock +from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase +from .scripture_update_element import ScriptureUpdateElementType + + +class ScriptureUpdateBlockHandlerFirstElementsFirst(ScriptureUpdateBlockHandlerBase): + + def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + # If a paragraph, embed or style element 
occurs before existing text, move it before inserted text as well. + current_insert_index = 0 + for current_index in range(len(block._elements)): + element = block._elements[current_index] + if element.type == ScriptureUpdateElementType.EXISTING_TEXT: + # we found existing text, so we stop looking for elements to move + break + if current_index != current_insert_index and element.type != ScriptureUpdateElementType.INSERTED_TEXT: + block._elements.remove(element) + block._elements.insert(current_insert_index, element) + current_insert_index += 1 + + return block diff --git a/machine/corpora/scripture_update_element.py b/machine/corpora/scripture_update_element.py new file mode 100644 index 00000000..fe39d7e5 --- /dev/null +++ b/machine/corpora/scripture_update_element.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum, auto + +from .usfm_token import UsfmToken + + +class ScriptureUpdateElementType(Enum): + EXISTING_TEXT = auto() + INSERTED_TEXT = auto() + OTHER = auto() + + +@dataclass +class ScriptureUpdateElement: + type: ScriptureUpdateElementType + tokens: list[UsfmToken] + marked_for_removal: bool = False + + def get_tokens(self) -> list[UsfmToken]: + if self.marked_for_removal: + return [] + return self.tokens diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index a51021b2..df3c90e7 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -1,6 +1,7 @@ from enum import Enum, auto from typing import Iterable, List, Optional, Sequence, Tuple, Union +from ..scripture.verse_ref import VerseRef from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType from .usfm_parser_state import UsfmParserState From 8f0be69266663faaae808664c9cefdf739d0c567 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 3 Apr 2025 
13:39:14 -0400 Subject: [PATCH 02/31] Added more test framework --- .../test_update_scripture_block_updater.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 tests/corpora/test_update_scripture_block_updater.py diff --git a/tests/corpora/test_update_scripture_block_updater.py b/tests/corpora/test_update_scripture_block_updater.py new file mode 100644 index 00000000..32d9057a --- /dev/null +++ b/tests/corpora/test_update_scripture_block_updater.py @@ -0,0 +1,119 @@ +from typing import List, Optional, Sequence, Tuple + +from machine.corpora.scripture_update_block_handler_first_elements_first import ( + ScriptureUpdateBlockHandlerFirstElementsFirst, +) + +from machine.corpora.scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase +from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH + +from machine.corpora import ( + FileParatextProjectTextUpdater, + ScriptureRef, + UpdateUsfmMarkerBehavior, + UpdateUsfmParserHandler, + UpdateUsfmTextBehavior, + parse_usfm, +) + + +def test_preserve_paragraphs(): + rows = [ + (scr_ref("MAT 1:1"), str("U1")), + ( + scr_ref("MAT 1:1/1:f"), + str("UF1"), + ), + (scr_ref("MAT 1:2"), str("U2")), + ( + scr_ref("MAT 1:2/1:f"), + str("UF2"), + ), + (scr_ref("MAT 1:3"), str("U3")), + ( + scr_ref("MAT 1:3/1:f"), + str("UF3"), + ), + ] + usfm = r"""\id MAT +\c 1 +\v 1 \f \ft \fm ' \fm* hello world \f* it comes first +\v 2 it comes \f \ft hello \fm ' \fm* world \f* middling +\v 3 it comes last \f \ft hello world \fm ' \fm* \f* +""" + + target = update_usfm(rows, usfm) + result = r"""\id MAT +\c 1 +\v 1 U1 \f \ft UF1 \fm ' \fm*\f* +\v 2 U2 \f \ft UF2 \fm ' \fm*\f* +\v 3 U3 \f \ft UF3 \fm ' \fm*\f* +""" + + assess(target, result) + + target_first_element = update_usfm( + rows, usfm, update_block_handlers=[ScriptureUpdateBlockHandlerFirstElementsFirst()] + ) + result_first_element = r"""\id MAT +\c 1 +\v 1 \f \ft \fm ' \fm* UF1 \f* U1 +\v 2 U2 \f \ft UF2 \fm ' \fm*\f* +\v 3 U3 \f \ft 
UF3 \fm ' \fm*\f* +""" + assess(target_first_element, result_first_element) + + +def scr_ref(*refs: str) -> List[ScriptureRef]: + return [ScriptureRef.parse(ref) for ref in refs] + + +def update_usfm( + rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, + source: Optional[str] = None, + id_text: Optional[str] = None, + text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, + paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, + embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, + style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, + preserve_paragraph_styles: Optional[Sequence[str]] = None, + update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None, +) -> Optional[str]: + if source is None: + updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH) + return updater.update_usfm( + "MAT", + rows, + id_text, + text_behavior, + paragraph_behavior, + embed_behavior, + style_behavior, + preserve_paragraph_styles, + update_block_handlers, + ) + else: + source = source.strip().replace("\r\n", "\n") + "\r\n" + updater = UpdateUsfmParserHandler( + rows, + id_text, + text_behavior, + paragraph_behavior, + embed_behavior, + style_behavior, + preserve_paragraph_styles, + update_block_handlers, + ) + parse_usfm(source, updater) + return updater.get_usfm() + + +def assess(target: Optional[str], truth: str) -> None: + assert target is not None + for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): + assert target_line.strip() == truth_line.strip() + + +def read_usfm() -> str: + with (USFM_TEST_PROJECT_PATH / "41MATTes.SFM").open("r", encoding="utf-8-sig", newline="\r\n") as file: + return file.read() From 5f4f9bc077dfd32eb993f756b9044bf8bb7593fa Mon Sep 17 00:00:00 2001 From: Ben King Date: Tue, 8 Apr 2025 16:12:36 -0400 Subject: [PATCH 03/31] Basic implementation and tests for quote convention detection --- 
machine/corpora/analysis/__init__.py | 3 + machine/corpora/analysis/chapter.py | 9 + .../preliminary_quotation_analyzer.py | 377 ++++++++++++++++++ .../analysis/quotation_mark_direction.py | 6 + .../corpora/analysis/quotation_mark_finder.py | 37 ++ .../analysis/quotation_mark_metadata.py | 27 ++ .../analysis/quotation_mark_resolver.py | 284 +++++++++++++ .../analysis/quotation_mark_string_match.py | 141 +++++++ .../analysis/quotation_mark_tabulator.py | 98 +++++ machine/corpora/analysis/quote_convention.py | 86 ++++ .../analysis/quote_convention_detector.py | 67 ++++ .../corpora/analysis/quote_convention_set.py | 124 ++++++ .../analysis/standard_quote_conventions.py | 193 +++++++++ machine/corpora/analysis/text_segment.py | 74 ++++ machine/corpora/analysis/usfm_marker_type.py | 11 + .../analysis/usfm_structure_extractor.py | 99 +++++ machine/corpora/analysis/verse.py | 15 + .../test_quote_convention_detector.py | 305 ++++++++++++++ 18 files changed, 1956 insertions(+) create mode 100644 machine/corpora/analysis/__init__.py create mode 100644 machine/corpora/analysis/chapter.py create mode 100644 machine/corpora/analysis/preliminary_quotation_analyzer.py create mode 100644 machine/corpora/analysis/quotation_mark_direction.py create mode 100644 machine/corpora/analysis/quotation_mark_finder.py create mode 100644 machine/corpora/analysis/quotation_mark_metadata.py create mode 100644 machine/corpora/analysis/quotation_mark_resolver.py create mode 100644 machine/corpora/analysis/quotation_mark_string_match.py create mode 100644 machine/corpora/analysis/quotation_mark_tabulator.py create mode 100644 machine/corpora/analysis/quote_convention.py create mode 100644 machine/corpora/analysis/quote_convention_detector.py create mode 100644 machine/corpora/analysis/quote_convention_set.py create mode 100644 machine/corpora/analysis/standard_quote_conventions.py create mode 100644 machine/corpora/analysis/text_segment.py create mode 100644 
machine/corpora/analysis/usfm_marker_type.py create mode 100644 machine/corpora/analysis/usfm_structure_extractor.py create mode 100644 machine/corpora/analysis/verse.py create mode 100644 tests/corpora/analysis/test_quote_convention_detector.py diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/analysis/__init__.py new file mode 100644 index 00000000..e8cd623c --- /dev/null +++ b/machine/corpora/analysis/__init__.py @@ -0,0 +1,3 @@ +from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector + +__all__ = ["QuoteConventionAnalysis", "QuoteConventionDetector"] diff --git a/machine/corpora/analysis/chapter.py b/machine/corpora/analysis/chapter.py new file mode 100644 index 00000000..f96441e6 --- /dev/null +++ b/machine/corpora/analysis/chapter.py @@ -0,0 +1,9 @@ +from .verse import Verse + + +class Chapter: + def __init__(self, verses: list[Verse]): + self.verses = verses + + def get_verses(self) -> list[Verse]: + return self.verses diff --git a/machine/corpora/analysis/preliminary_quotation_analyzer.py b/machine/corpora/analysis/preliminary_quotation_analyzer.py new file mode 100644 index 00000000..882a6f99 --- /dev/null +++ b/machine/corpora/analysis/preliminary_quotation_analyzer.py @@ -0,0 +1,377 @@ +from typing import Dict, Generator, List, Tuple + +import regex + +from .chapter import Chapter +from .quotation_mark_finder import QuotationMarkFinder +from .quotation_mark_string_match import QuotationMarkStringMatch +from .quote_convention_set import QuoteConventionSet +from .text_segment import TextSegment +from .verse import Verse + + +class CharacterCountStatistics: + def __init__(self): + self.reset() + + def reset(self) -> None: + self.num_characters = 0 + self.num_apostrophes = 0 + + def count_characters(self, text_segment: TextSegment) -> None: + self.num_characters += len(text_segment.get_text()) + + def add_apostrophe(self) -> None: + self.num_apostrophes += 1 + + def is_apostrophe_proportion_greater_than(self, 
threshold: float) -> bool: + if self.num_characters == 0: + return False + return self.num_apostrophes / self.num_characters > threshold + + +class QuotationMarkWordPositions: + def __init__(self): + self.reset() + + def reset(self) -> None: + self.word_initial_occurrences: Dict[str, int] = dict() + self.mid_word_occurrences: Dict[str, int] = dict() + self.word_final_occurrences: Dict[str, int] = dict() + + def count_word_initial_apostrophe(self, quotation_mark: str) -> None: + if quotation_mark not in self.word_initial_occurrences: + self.word_initial_occurrences[quotation_mark] = 0 + self.word_initial_occurrences[quotation_mark] += 1 + + def count_mid_word_apostrophe(self, quotation_mark: str) -> None: + if quotation_mark not in self.mid_word_occurrences: + self.mid_word_occurrences[quotation_mark] = 0 + self.mid_word_occurrences[quotation_mark] += 1 + + def count_word_final_apostrophe(self, quotation_mark: str) -> None: + if quotation_mark not in self.word_final_occurrences: + self.word_final_occurrences[quotation_mark] = 0 + self.word_final_occurrences[quotation_mark] += 1 + + def _get_word_initial_occurrences(self, quotation_mark: str) -> int: + return self.word_initial_occurrences[quotation_mark] if quotation_mark in self.word_initial_occurrences else 0 + + def _get_mid_word_occurrences(self, quotation_mark: str) -> int: + return self.mid_word_occurrences[quotation_mark] if quotation_mark in self.mid_word_occurrences else 0 + + def _get_word_final_occurrences(self, quotation_mark: str) -> int: + return self.word_final_occurrences[quotation_mark] if quotation_mark in self.word_final_occurrences else 0 + + def _get_total_occurrences(self, quotation_mark: str) -> int: + return ( + self._get_word_initial_occurrences(quotation_mark) + + self._get_mid_word_occurrences(quotation_mark) + + self._get_word_final_occurrences(quotation_mark) + ) + + def is_mark_rarely_initial(self, quotation_mark: str) -> bool: + num_initial_marks: int = 
self._get_word_initial_occurrences(quotation_mark) + num_total_marks: int = self._get_total_occurrences(quotation_mark) + return num_total_marks > 0 and num_initial_marks / num_total_marks < 0.1 + + def is_mark_rarely_final(self, quotation_mark: str) -> bool: + num_final_marks: int = self._get_word_final_occurrences(quotation_mark) + num_total_marks: int = self._get_total_occurrences(quotation_mark) + return num_total_marks > 0 and num_final_marks / num_total_marks < 0.1 + + def are_initial_and_final_rates_similar(self, quotation_mark: str) -> bool: + num_initial_marks: int = self._get_word_initial_occurrences(quotation_mark) + num_final_marks: int = self._get_word_final_occurrences(quotation_mark) + num_total_marks: int = self._get_total_occurrences(quotation_mark) + return num_total_marks > 0 and abs(num_initial_marks - num_final_marks) / num_total_marks > 0.3 + + def is_mark_commonly_mid_word(self, quotation_mark: str) -> bool: + num_mid_word_marks: int = self._get_mid_word_occurrences(quotation_mark) + num_total_marks: int = self._get_total_occurrences(quotation_mark) + return num_total_marks > 0 and num_mid_word_marks / num_total_marks > 0.3 + + +class QuotationMarkVersePositions: + def __init__(self): + self.reset() + + def reset(self) -> None: + self.verse_starting_quotation_mark_counts: Dict[str, int] = dict() + self.verse_ending_quotation_mark_counts: Dict[str, int] = dict() + + def process_verse_starting_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> None: + if quotation_mark_match.get_quotation_mark() not in self.verse_starting_quotation_mark_counts: + self.verse_starting_quotation_mark_counts[quotation_mark_match.get_quotation_mark()] = 0 + self.verse_starting_quotation_mark_counts[quotation_mark_match.get_quotation_mark()] += 1 + + def process_verse_ending_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> None: + if quotation_mark_match.get_quotation_mark() not in self.verse_ending_quotation_mark_counts: + 
self.verse_ending_quotation_mark_counts[quotation_mark_match.get_quotation_mark()] = 0 + self.verse_ending_quotation_mark_counts[quotation_mark_match.get_quotation_mark()] += 1 + + +class QuotationMarkSequences: + def __init__(self): + self.reset() + + def reset(self) -> None: + self.earlier_quotation_mark_counts: Dict[str, int] = dict() + self.later_quotation_mark_counts: Dict[str, int] = dict() + + def record_earlier_quotation_mark(self, quotation_mark: str) -> None: + if quotation_mark not in self.earlier_quotation_mark_counts: + self.earlier_quotation_mark_counts[quotation_mark] = 0 + self.earlier_quotation_mark_counts[quotation_mark] += 1 + + def record_later_quotation_mark(self, quotation_mark: str) -> None: + if quotation_mark not in self.later_quotation_mark_counts: + self.later_quotation_mark_counts[quotation_mark] = 0 + self.later_quotation_mark_counts[quotation_mark] += 1 + + def _get_earlier_occurrences(self, quotation_mark: str) -> int: + return ( + self.earlier_quotation_mark_counts[quotation_mark] + if quotation_mark in self.earlier_quotation_mark_counts + else 0 + ) + + def _get_later_occurrences(self, quotation_mark: str) -> int: + return ( + self.later_quotation_mark_counts[quotation_mark] + if quotation_mark in self.later_quotation_mark_counts + else 0 + ) + + def is_mark_much_more_common_earlier(self, quotation_mark: str) -> bool: + num_early_occurrences: int = self._get_earlier_occurrences(quotation_mark) + num_late_occurrences: int = self._get_later_occurrences(quotation_mark) + return ( + num_late_occurrences == 0 and num_early_occurrences > 5 + ) or num_early_occurrences > num_late_occurrences * 10 + + def is_mark_much_more_common_later(self, quotation_mark: str) -> bool: + num_early_occurrences: int = self._get_earlier_occurrences(quotation_mark) + num_late_occurrences: int = self._get_later_occurrences(quotation_mark) + return ( + num_early_occurrences == 0 and num_late_occurrences > 5 + ) or num_late_occurrences > num_early_occurrences * 
10 + + def is_mark_common_early_and_late(self, quotation_mark: str) -> bool: + num_early_occurrences: int = self._get_earlier_occurrences(quotation_mark) + num_late_occurrences: int = self._get_later_occurrences(quotation_mark) + return ( + num_early_occurrences > 0 + and abs(num_late_occurrences - num_early_occurrences) / num_early_occurrences < 0.2 + ) + + +class QuotationMarkGrouper: + def __init__(self, quotation_marks: list[QuotationMarkStringMatch], quote_convention_set: QuoteConventionSet): + self.quote_convention_set = quote_convention_set + self._group_quotation_marks(quotation_marks) + + def _group_quotation_marks(self, quotation_marks: list[QuotationMarkStringMatch]) -> None: + self.grouped_quotation_marks: Dict[str, List[QuotationMarkStringMatch]] = dict() + for quotation_mark_match in quotation_marks: + if quotation_mark_match.get_quotation_mark() not in self.grouped_quotation_marks: + self.grouped_quotation_marks[quotation_mark_match.get_quotation_mark()] = [] + self.grouped_quotation_marks[quotation_mark_match.get_quotation_mark()].append(quotation_mark_match) + + def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: + for mark1, matches1 in self.grouped_quotation_marks.items(): + # handle cases of identical opening/closing marks + if ( + len(matches1) == 2 + and self.quote_convention_set.is_quotation_mark_direction_ambiguous(mark1) + and not self.has_distinct_paired_quotation_mark(mark1) + ): + yield (mark1, mark1) + continue + + # skip verses where quotation mark pairs are ambiguous + if len(matches1) > 1: + continue + + # find matching closing marks + for mark2, matches2 in self.grouped_quotation_marks.items(): + if ( + len(matches2) == 1 + and self.quote_convention_set.are_marks_a_valid_pair(mark1, mark2) + and matches1[0].precedes(matches2[0]) + ): + yield (mark1, mark2) + + def has_distinct_paired_quotation_mark(self, quotation_mark: str) -> bool: + return any( + [ + mark != quotation_mark and mark in 
self.grouped_quotation_marks + for mark in self.quote_convention_set.get_possible_paired_quotation_marks(quotation_mark) + ] + ) + + +class PreliminaryQuotationAnalyzer: + apostrophe_pattern = regex.compile(r"[\'\u2019]", regex.U) + + def __init__(self, quote_conventions: QuoteConventionSet): + self.quote_conventions = quote_conventions + self.character_count_statistics = CharacterCountStatistics() + self.word_position_statistics = QuotationMarkWordPositions() + self.verse_positions = QuotationMarkVersePositions() + self.quotation_mark_sequences = QuotationMarkSequences() + self._reset_analysis() + + def _reset_analysis(self) -> None: + self.character_count_statistics.reset() + self.word_position_statistics.reset() + self.verse_positions.reset() + self.quotation_mark_sequences.reset() + self.earlier_quotation_mark_counts: dict[str, int] = dict() + self.later_quotation_mark_counts: dict[str, int] = dict() + + def narrow_down_possible_quote_conventions(self, chapters: list[Chapter]) -> QuoteConventionSet: + for chapter in chapters: + self._analyze_quotation_marks_for_chapter(chapter) + return self._select_compatible_quote_conventions() + + def _analyze_quotation_marks_for_chapter(self, chapter: Chapter) -> None: + for verse in chapter.get_verses(): + self._analyze_quotation_marks_for_verse(verse) + + def _analyze_quotation_marks_for_verse(self, verse: Verse) -> None: + self._count_characters_in_verse(verse) + quotation_marks = QuotationMarkFinder(self.quote_conventions).find_all_potential_quotation_marks_in_verse(verse) + self._analyze_quotation_mark_sequence(quotation_marks) + self._count_verse_starting_and_ending_quotation_marks(quotation_marks) + + def _count_characters_in_verse(self, verse: Verse) -> None: + for text_segment in verse.get_text_segments(): + self._count_characters_in_text_segment(text_segment) + + def _count_characters_in_text_segment(self, text_segment: TextSegment) -> None: + self.character_count_statistics.count_characters(text_segment) + + def 
_analyze_quotation_mark_sequence(self, quotation_marks: list[QuotationMarkStringMatch]) -> None: + quotation_mark_grouper: QuotationMarkGrouper = QuotationMarkGrouper(quotation_marks, self.quote_conventions) + for earlier_mark, later_mark in quotation_mark_grouper.get_quotation_mark_pairs(): + self.quotation_mark_sequences.record_earlier_quotation_mark(earlier_mark) + self.quotation_mark_sequences.record_later_quotation_mark(later_mark) + + def _count_verse_starting_and_ending_quotation_marks(self, quotation_marks: list[QuotationMarkStringMatch]) -> None: + for quotation_mark_match in quotation_marks: + if quotation_mark_match.does_quotation_mark_match(self.apostrophe_pattern): + self._count_apostrophe(quotation_mark_match) + if self._is_at_start_of_verse(quotation_mark_match): + self.verse_positions.process_verse_starting_quotation_mark(quotation_mark_match) + if self._is_at_end_of_verse(quotation_mark_match): + self.verse_positions.process_verse_ending_quotation_mark(quotation_mark_match) + + def _is_at_start_of_verse(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: + return ( + quotation_mark_match.get_text_segment().is_first_segment_in_verse() + and not quotation_mark_match.has_leading_letter() + ) + + def _is_at_end_of_verse(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: + return ( + quotation_mark_match.get_text_segment().is_last_segment_in_verse() + and not quotation_mark_match.has_trailing_letter() + ) + + def _count_apostrophe(self, apostrophe_match: QuotationMarkStringMatch) -> None: + apostrophe: str = apostrophe_match.get_quotation_mark() + self.character_count_statistics.add_apostrophe() + if self._is_match_word_initial(apostrophe_match): + self.word_position_statistics.count_word_initial_apostrophe(apostrophe) + elif self._is_match_mid_word(apostrophe_match): + self.word_position_statistics.count_mid_word_apostrophe(apostrophe) + elif self._is_match_word_final(apostrophe_match): + 
self.word_position_statistics.count_word_final_apostrophe(apostrophe) + + def _is_match_word_initial(self, apostrophe_match: QuotationMarkStringMatch) -> bool: + if apostrophe_match.has_trailing_whitespace(): + return False + if not apostrophe_match.is_at_start_of_segment() and not apostrophe_match.has_leading_whitespace(): + return False + return True + + def _is_match_mid_word(self, apostrophe_match: QuotationMarkStringMatch) -> bool: + if apostrophe_match.has_trailing_whitespace(): + return False + if apostrophe_match.has_leading_whitespace(): + return False + return True + + def _is_match_word_final(self, apostrophe_match: QuotationMarkStringMatch) -> bool: + if not apostrophe_match.is_at_end_of_segment() and not apostrophe_match.has_trailing_whitespace(): + return False + if apostrophe_match.has_leading_whitespace(): + return False + return True + + def _select_compatible_quote_conventions(self) -> QuoteConventionSet: + opening_quotation_marks = self._find_opening_quotation_marks() + closing_quotation_marks = self._find_closing_quotation_marks() + + return self.quote_conventions.filter_to_compatible_quote_conventions( + opening_quotation_marks, closing_quotation_marks + ) + + def _find_opening_quotation_marks(self) -> List[str]: + return [ + quotation_mark + for quotation_mark in self.quote_conventions.get_possible_opening_marks() + if self._is_opening_quotation_mark(quotation_mark) + ] + + def _is_opening_quotation_mark(self, quotation_mark: str) -> bool: + if self._is_apostrophe_only(quotation_mark): + return False + + if self.quotation_mark_sequences.is_mark_much_more_common_earlier(quotation_mark): + return True + if self.quotation_mark_sequences.is_mark_common_early_and_late( + quotation_mark + ) and self.quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): + return True + return False + + def _find_closing_quotation_marks(self) -> List[str]: + return [ + quotation_mark + for quotation_mark in 
self.quote_conventions.get_possible_closing_marks() + if self._is_closing_quotation_mark(quotation_mark) + ] + + def _is_closing_quotation_mark(self, quotation_mark: str) -> bool: + if self._is_apostrophe_only(quotation_mark): + return False + + if self.quotation_mark_sequences.is_mark_much_more_common_later(quotation_mark): + return True + + if self.quotation_mark_sequences.is_mark_common_early_and_late( + quotation_mark + ) and self.quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): + return True + return False + + def _is_apostrophe_only(self, mark: str) -> bool: + if not self.apostrophe_pattern.search(mark): + return False + + if self.word_position_statistics.is_mark_rarely_initial( + mark + ) or self.word_position_statistics.is_mark_rarely_final(mark): + return True + + if self.word_position_statistics.are_initial_and_final_rates_similar( + mark + ) and self.word_position_statistics.is_mark_commonly_mid_word(mark): + return True + + if self.character_count_statistics.is_apostrophe_proportion_greater_than(0.02): + return True + + return False diff --git a/machine/corpora/analysis/quotation_mark_direction.py b/machine/corpora/analysis/quotation_mark_direction.py new file mode 100644 index 00000000..e3996423 --- /dev/null +++ b/machine/corpora/analysis/quotation_mark_direction.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class QuotationMarkDirection(Enum): + Opening = "Opening" + Closing = "Closing" diff --git a/machine/corpora/analysis/quotation_mark_finder.py b/machine/corpora/analysis/quotation_mark_finder.py new file mode 100644 index 00000000..fb187171 --- /dev/null +++ b/machine/corpora/analysis/quotation_mark_finder.py @@ -0,0 +1,37 @@ +import regex + +from .chapter import Chapter +from .quotation_mark_string_match import QuotationMarkStringMatch +from .quote_convention_set import QuoteConventionSet +from .text_segment import TextSegment +from .verse import Verse + + +class QuotationMarkFinder: + quote_pattern = 
regex.compile(r"(\p{Quotation_Mark}|<<|>>|<|>)", regex.U) + + def __init__(self, quote_convention_set: QuoteConventionSet): + self.quote_convention_set = quote_convention_set + + def find_all_potential_quotation_marks_in_chapter(self, chapter: Chapter) -> list[QuotationMarkStringMatch]: + quotation_matches: list[QuotationMarkStringMatch] = [] + for verse in chapter.get_verses(): + quotation_matches.extend(self.find_all_potential_quotation_marks_in_verse(verse)) + return quotation_matches + + def find_all_potential_quotation_marks_in_verse(self, verse: Verse) -> list[QuotationMarkStringMatch]: + quotation_matches: list[QuotationMarkStringMatch] = [] + for text_segment in verse.get_text_segments(): + quotation_matches.extend(self.find_all_potential_quotation_marks_in_text_segment(text_segment)) + return quotation_matches + + def find_all_potential_quotation_marks_in_text_segment( + self, text_segment: TextSegment + ) -> list[QuotationMarkStringMatch]: + quotation_matches: list[QuotationMarkStringMatch] = [] + for quote_match in self.quote_pattern.finditer(text_segment.get_text()): + if self.quote_convention_set.is_valid_opening_quotation_mark( + quote_match.group() + ) or self.quote_convention_set.is_valid_closing_quotation_mark(quote_match.group()): + quotation_matches.append(QuotationMarkStringMatch(text_segment, quote_match.start(), quote_match.end())) + return quotation_matches diff --git a/machine/corpora/analysis/quotation_mark_metadata.py b/machine/corpora/analysis/quotation_mark_metadata.py new file mode 100644 index 00000000..02bba93d --- /dev/null +++ b/machine/corpora/analysis/quotation_mark_metadata.py @@ -0,0 +1,27 @@ +from .quotation_mark_direction import QuotationMarkDirection + + +class QuotationMarkMetadata: + def __init__( + self, quotation_mark: str, depth: int, direction: QuotationMarkDirection, start_index: int, end_index: int + ): + self.quotation_mark = quotation_mark + self.depth = depth + self.direction = direction + self.start_index = 
class QuotationMarkResolverState:
    """Stack of the quotation marks that are currently open.

    ``current_depth`` is incremented and decremented in step with
    ``quotation_stack`` by the two ``add_*`` methods.
    """

    def __init__(self):
        self.quotation_stack: list[QuotationMarkMetadata] = []
        self.current_depth: int = 0

    def has_open_quotation_mark(self) -> bool:
        """Return True while at least one quotation is open."""
        return self.current_depth > 0

    def are_more_than_n_quotes_open(self, n: int) -> bool:
        """Return True when strictly more than ``n`` quotations are open."""
        return self.current_depth > n

    def add_opening_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        """Resolve ``quote_match`` as an opening mark one level deeper and push it."""
        quote = quote_match.resolve(self.current_depth + 1, QuotationMarkDirection.Opening)
        self.quotation_stack.append(quote)
        self.current_depth += 1
        return quote

    def add_closing_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        """Resolve ``quote_match`` as closing the current depth and pop the stack.

        Callers are expected to check ``has_open_quotation_mark()`` first;
        popping an empty stack raises ``IndexError``.
        """
        quote = quote_match.resolve(self.current_depth, QuotationMarkDirection.Closing)
        self.quotation_stack.pop()
        self.current_depth -= 1
        return quote

    def get_deepest_opening_quotation_mark(self) -> str:
        """Return the text of the most recently opened quotation mark.

        Raises:
            RuntimeError: if no quotation is open.
        """
        if not self.has_open_quotation_mark():
            raise RuntimeError(
                "get_deepest_opening_quotation_mark() was called when the stack of quotation marks was empty."
            )
        return self.quotation_stack[-1].get_quotation_mark()


class QuotationContinuerState:
    """Stack of the quotation continuers seen so far in the current run."""

    def __init__(self):
        self.quotation_continuer_stack: list[QuotationMarkMetadata] = []

    def has_continuer_been_observed(self) -> bool:
        """Return True when at least one continuer is on the stack."""
        return len(self.quotation_continuer_stack) > 0

    def add_quotation_continuer(
        self, quote_match: QuotationMarkStringMatch, quotation_mark_resolver_state: QuotationMarkResolverState
    ) -> QuotationMarkMetadata:
        """Resolve ``quote_match`` as a continuer and record it.

        The continuer is resolved as an opening mark whose depth is its position
        in the continuer stack. Once as many continuers have been recorded as
        there are open quotations, the continuer stack is cleared.
        """
        quote = quote_match.resolve(len(self.quotation_continuer_stack) + 1, QuotationMarkDirection.Opening)
        self.quotation_continuer_stack.append(quote)
        if len(self.quotation_continuer_stack) == len(quotation_mark_resolver_state.quotation_stack):
            self.quotation_continuer_stack.clear()
        return quote


class QuotationMarkResolver:
    """Assign a depth and a direction to each candidate quotation mark.

    The resolver is stateful: it keeps the stack of open quotations and the
    stack of observed continuers between calls.
    """

    quote_pattern = regex.compile(r"(?<=(.)|^)(\p{Quotation_Mark}|<<|>>|<|>)(?=(.)|$)", regex.U)
    apostrophe_pattern = regex.compile(r"[\'\u2019\u2018]", regex.U)
    whitespace_pattern = regex.compile(r"^[\s~]*$", regex.U)
    latin_letter_pattern = regex.compile(r"^\p{script=Latin}$", regex.U)
    punctuation_pattern = regex.compile(r"^[\.,;\?!\)\]\-—۔،؛]$", regex.U)
    # Compiled once here instead of on every _is_apostrophe() call.
    _final_s_pattern = regex.compile(r"s")

    def __init__(self, quote_convention_set: QuoteConventionSet):
        self.quote_convention_set = quote_convention_set
        self.quotation_mark_resolver_state = QuotationMarkResolverState()
        self.quotation_continuer_state = QuotationContinuerState()

    def resolve_quotation_marks(
        self, quote_matches: list[QuotationMarkStringMatch]
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Yield the resolved form of every match that is judged to be a quotation mark.

        Each match is examined together with its immediate neighbours in
        ``quote_matches``; matches that are rejected yield nothing.
        """
        last_index = len(quote_matches) - 1
        for quote_index, quote_match in enumerate(quote_matches):
            previous_mark = None if quote_index == 0 else quote_matches[quote_index - 1]
            next_mark = None if quote_index == last_index else quote_matches[quote_index + 1]
            yield from self._resolve_quotation_mark(quote_match, previous_mark, next_mark)

    def _resolve_quotation_mark(
        self,
        quote_match: QuotationMarkStringMatch,
        previous_mark: Union[QuotationMarkStringMatch, None],
        next_mark: Union[QuotationMarkStringMatch, None],
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Classify one match and yield at most one resolved quotation mark.

        The order of the tests matters: opening quote, apostrophe, closing
        quote, malformed closing quote, malformed opening quote.
        """
        if self._is_opening_quote(quote_match, previous_mark, next_mark):
            if self._is_quotation_continuer(quote_match, previous_mark, next_mark):
                yield self._process_quotation_continuer(quote_match)
            else:
                if self._is_depth_too_great():
                    return
                yield self._process_opening_mark(quote_match)
        elif self._is_apostrophe(quote_match, previous_mark, next_mark):
            # apostrophes are not quotation marks, so nothing is yielded
            pass
        elif self._is_closing_quote(quote_match, previous_mark, next_mark):
            if not self.quotation_mark_resolver_state.has_open_quotation_mark():
                return
            yield self._process_closing_mark(quote_match)
        elif self._is_malformed_closing_quote(quote_match, previous_mark, next_mark):
            yield self._process_closing_mark(quote_match)
        elif self._is_malformed_opening_quote(quote_match, previous_mark, next_mark):
            yield self._process_opening_mark(quote_match)

    def _is_quotation_continuer(
        self,
        quote_match: QuotationMarkStringMatch,
        previous_match: Union[QuotationMarkStringMatch, None],
        next_match: Union[QuotationMarkStringMatch, None],
    ) -> bool:
        """Return True when the mark re-opens an already open quotation after a paragraph break."""
        if not quote_match.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker):
            return False
        if not self.quotation_mark_resolver_state.has_open_quotation_mark():
            return False

        if not self.quotation_continuer_state.has_continuer_been_observed():
            # the first continuer must be the very first thing in its segment
            if quote_match.get_start_index() > 0:
                return False
            if (
                quote_match.get_quotation_mark()
                != self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark()
            ):
                return False
            # with nested quotations open, another mark must follow immediately
            if self.quotation_mark_resolver_state.are_more_than_n_quotes_open(1):
                if next_match is None or next_match.get_start_index() != quote_match.get_end_index():
                    return False
        else:
            if (
                quote_match.get_quotation_mark()
                != self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark()
            ):
                return False

        return True

    def _process_quotation_continuer(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        return self.quotation_continuer_state.add_quotation_continuer(quote_match, self.quotation_mark_resolver_state)

    def _is_depth_too_great(self) -> bool:
        """Return True when more than four quotations are already open."""
        return self.quotation_mark_resolver_state.are_more_than_n_quotes_open(4)

    def _process_opening_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        return self.quotation_mark_resolver_state.add_opening_quotation_mark(quote_match)

    def _process_closing_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        return self.quotation_mark_resolver_state.add_closing_quotation_mark(quote_match)

    def _is_opening_quote(
        self,
        match: QuotationMarkStringMatch,
        previous_match: Union[QuotationMarkStringMatch, None],
        next_match: Union[QuotationMarkStringMatch, None],
    ) -> bool:
        """Return True when the mark is a valid opening mark used as one."""
        if not match.is_valid_opening_quotation_mark(self.quote_convention_set):
            return False

        # if the quote convention is ambiguous, use whitespace as a clue
        if match.is_valid_closing_quotation_mark(self.quote_convention_set):
            return (
                match.has_leading_whitespace()
                or self._does_most_recent_opening_mark_immediately_precede(match)
                or match.has_leading_quote_introducer()
            ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation())
        return True

    def _is_closing_quote(
        self,
        match: QuotationMarkStringMatch,
        previous_match: Union[QuotationMarkStringMatch, None],
        next_match: Union[QuotationMarkStringMatch, None],
    ) -> bool:
        """Return True when the mark is a valid closing mark used as one."""
        if not match.is_valid_closing_quotation_mark(self.quote_convention_set):
            return False

        # if the quote convention is ambiguous, use whitespace as a clue
        if match.is_valid_opening_quotation_mark(self.quote_convention_set):
            return (
                match.has_trailing_whitespace()
                or match.has_trailing_punctuation()
                or match.has_trailing_closing_quotation_mark(self.quote_convention_set)
            ) and not match.has_leading_whitespace()
        return True

    def _is_malformed_opening_quote(
        self,
        match: QuotationMarkStringMatch,
        previous_match: Union[QuotationMarkStringMatch, None],
        next_match: Union[QuotationMarkStringMatch, None],
    ) -> bool:
        """Return True for an opening mark that failed the regular opening test but still reads as one."""
        if not match.is_valid_opening_quotation_mark(self.quote_convention_set):
            return False

        if match.has_leading_quote_introducer():
            return True

        # a free-standing mark with nothing open can only be starting a quotation
        return (
            match.has_leading_whitespace()
            and match.has_trailing_whitespace()
            and not self.quotation_mark_resolver_state.has_open_quotation_mark()
        )

    def _is_malformed_closing_quote(
        self,
        match: QuotationMarkStringMatch,
        previous_match: Union[QuotationMarkStringMatch, None],
        next_match: Union[QuotationMarkStringMatch, None],
    ) -> bool:
        """Return True for a closing mark that failed the regular closing test but pairs with the open mark."""
        if not match.is_valid_closing_quotation_mark(self.quote_convention_set):
            return False

        return (
            (
                not match.has_trailing_whitespace()
                or (match.has_leading_whitespace() and match.has_trailing_whitespace())
            )
            and self.quotation_mark_resolver_state.has_open_quotation_mark()
            and self.quote_convention_set.are_marks_a_valid_pair(
                self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark()
            )
        )

    def _does_most_recent_opening_mark_immediately_precede(self, match: QuotationMarkStringMatch) -> bool:
        """Return True when the character before ``match`` is the deepest open mark."""
        if not self.quotation_mark_resolver_state.has_open_quotation_mark():
            return False

        return self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.get_previous_character()

    def _is_apostrophe(
        self,
        match: QuotationMarkStringMatch,
        previous_match: Union[QuotationMarkStringMatch, None],
        next_match: Union[QuotationMarkStringMatch, None],
    ) -> bool:
        """Return True when the mark is better read as an apostrophe than as a quotation mark."""
        if not match.does_quotation_mark_match(self.apostrophe_pattern):
            return False

        # Latin letters on both sides of punctuation mark
        # (the has_*_latin_letter helpers already return False when there is no neighbour)
        if match.has_leading_latin_letter() and match.has_trailing_latin_letter():
            return True

        state = self.quotation_mark_resolver_state
        conventions = self.quote_convention_set

        # potential final s possessive (e.g. Moses')
        if match.does_previous_character_match(self._final_s_pattern) and (
            match.has_trailing_whitespace() or match.has_trailing_punctuation()
        ):
            # check whether it could be a closing quote
            if not state.has_open_quotation_mark():
                return True
            if not conventions.are_marks_a_valid_pair(
                state.get_deepest_opening_quotation_mark(), match.get_quotation_mark()
            ):
                return True
            if next_match is not None and conventions.are_marks_a_valid_pair(
                state.get_deepest_opening_quotation_mark(), next_match.get_quotation_mark()
            ):
                return True

        # for languages that use apostrophes at the start and end of words
        if (not state.has_open_quotation_mark() and match.get_quotation_mark() == "'") or (
            state.has_open_quotation_mark()
            and not conventions.are_marks_a_valid_pair(
                state.get_deepest_opening_quotation_mark(), match.get_quotation_mark()
            )
        ):
            return True

        return False
class QuotationMarkStringMatch:
    """A candidate quotation mark: the span ``[start_index, end_index)`` of a text segment.

    The ``has_*`` helpers inspect the single character on either side of the
    span, looking into the neighbouring segment when the span touches a
    segment edge.
    """

    # extra stuff in the regex to handle Western Cham
    letter_pattern: Pattern = regex.compile(r"[\p{L}\U0001E200-\U0001E28F]", regex.U)
    latin_letter_pattern: Pattern = regex.compile(r"^\p{script=Latin}$", regex.U)
    whitespace_pattern: Pattern = regex.compile(r"[\s~]", regex.U)
    punctuation_pattern: Pattern = regex.compile(r"[\.,;\?!\)\]\-—۔،؛]", regex.U)
    quote_introducer_pattern: Pattern = regex.compile(r"[:,]", regex.U)

    def __init__(self, text_segment: TextSegment, start_index: int, end_index: int):
        self.text_segment = text_segment
        self.start_index = start_index
        self.end_index = end_index

    def get_quotation_mark(self) -> str:
        """Return the matched text itself."""
        return self.text_segment.get_text()[self.start_index : self.end_index]

    def is_valid_opening_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool:
        return quote_convention_set.is_valid_opening_quotation_mark(self.get_quotation_mark())

    def is_valid_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool:
        return quote_convention_set.is_valid_closing_quotation_mark(self.get_quotation_mark())

    def does_quotation_mark_match(self, regex_pattern: regex.Pattern) -> bool:
        return regex_pattern.search(self.get_quotation_mark()) is not None

    def does_next_character_match(self, regex_pattern: regex.Pattern) -> bool:
        """Return True when a following character exists and ``regex_pattern`` matches it."""
        next_character = self.get_next_character()
        return next_character is not None and regex_pattern.search(next_character) is not None

    def does_previous_character_match(self, regex_pattern: regex.Pattern) -> bool:
        """Return True when a preceding character exists and ``regex_pattern`` matches it."""
        previous_character = self.get_previous_character()
        return previous_character is not None and regex_pattern.search(previous_character) is not None

    def get_previous_character(self) -> Union[str, None]:
        """Return the character just before the match, or None when there is none.

        At the start of a segment the last character of the previous segment is
        used, unless a paragraph marker is in this segment's preceding context
        or the previous segment is missing or empty.
        """
        if self.start_index == 0:
            previous_segment = self.text_segment.get_previous_segment()
            if previous_segment is None or self.text_segment.is_marker_in_preceding_context(
                UsfmMarkerType.ParagraphMarker
            ):
                return None
            previous_text = previous_segment.get_text()
            # an empty neighbouring segment has no character to offer
            return previous_text[-1] if previous_text else None
        return self.text_segment.get_text()[self.start_index - 1]

    def get_next_character(self) -> Union[str, None]:
        """Return the character just after the match, or None when there is none.

        At the end of a segment the first character of the next segment is
        used, unless a paragraph marker is in that segment's preceding context
        or the next segment is missing or empty.
        """
        if self.is_at_end_of_segment():
            next_segment = self.text_segment.get_next_segment()
            if next_segment is None or next_segment.is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker):
                return None
            next_text = next_segment.get_text()
            # an empty neighbouring segment has no character to offer
            return next_text[0] if next_text else None
        return self.text_segment.get_text()[self.end_index]

    # this assumes that the two matches occur in the same verse
    def precedes(self, other: "QuotationMarkStringMatch") -> bool:
        """Return True when this match comes before ``other`` (segment order first, then start index)."""
        own_position = self.text_segment.index_in_verse
        other_position = other.text_segment.index_in_verse
        return own_position < other_position or (
            own_position == other_position and self.start_index < other.start_index
        )

    def get_text_segment(self) -> TextSegment:
        return self.text_segment

    def get_start_index(self) -> int:
        return self.start_index

    def get_end_index(self) -> int:
        return self.end_index

    def get_context(self) -> str:
        """Return the match with up to ten characters of its segment on either side."""
        text = self.text_segment.get_text()
        return text[max(self.start_index - 10, 0) : min(self.end_index + 10, len(text))]

    def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata:
        """Build the metadata record for this match at the given depth and direction."""
        return QuotationMarkMetadata(self.get_quotation_mark(), depth, direction, self.start_index, self.end_index)

    def is_at_start_of_segment(self) -> bool:
        return self.start_index == 0

    def is_at_end_of_segment(self) -> bool:
        return self.end_index == self.text_segment.length()

    def has_leading_whitespace(self) -> bool:
        """Return True when whitespace, or a paragraph/embed/verse marker, comes directly before the match."""
        if self.get_previous_character() is None:
            # with no preceding character, these markers count as whitespace
            return self.get_text_segment().get_immediate_preceding_marker_type() in (
                UsfmMarkerType.ParagraphMarker,
                UsfmMarkerType.EmbedMarker,
                UsfmMarkerType.VerseMarker,
            )

        return self.does_previous_character_match(self.whitespace_pattern)

    def has_trailing_whitespace(self) -> bool:
        return self.does_next_character_match(self.whitespace_pattern)

    def has_leading_punctuation(self) -> bool:
        # "leading" means the character before the match; this previously
        # tested the next character, duplicating has_trailing_punctuation()
        return self.does_previous_character_match(self.punctuation_pattern)

    def has_trailing_punctuation(self) -> bool:
        return self.does_next_character_match(self.punctuation_pattern)

    # TODO: refactor this to use a passed regex pattern
    def has_leading_letter(self) -> bool:
        """Return True when a letter occurs anywhere in the segment text before the match."""
        return bool(self.letter_pattern.search(self.text_segment.substring_before(self.start_index)))

    def has_trailing_letter(self) -> bool:
        """Return True when a letter occurs in what ``substring_after`` returns for the end of the match."""
        return bool(self.letter_pattern.search(self.text_segment.substring_after(self.end_index)))

    def has_leading_latin_letter(self) -> bool:
        return self.does_previous_character_match(self.latin_letter_pattern)

    def has_trailing_latin_letter(self) -> bool:
        return self.does_next_character_match(self.latin_letter_pattern)

    def has_leading_quote_introducer(self) -> bool:
        return self.does_previous_character_match(self.quote_introducer_pattern)

    def has_leading_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool:
        # must use the closing-mark regex; this previously used the opening-mark regex
        return self.does_previous_character_match(quote_convention_set.get_closing_quotation_mark_regex())

    def has_trailing_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool:
        return self.does_next_character_match(quote_convention_set.get_closing_quotation_mark_regex())
class QuotationMarkCounts:
    """Frequency table of the quotation-mark strings counted so far."""

    def __init__(self):
        self.string_counts: Dict[str, int] = dict()
        self.total_count = 0

    def count_quotation_mark(self, quotation_mark: str) -> None:
        """Record one more occurrence of ``quotation_mark``."""
        self.string_counts[quotation_mark] = self.string_counts.get(quotation_mark, 0) + 1
        self.total_count += 1

    def get_best_proportion(self) -> tuple[str, int, int]:
        """Return ``(most frequent mark, its count, total count)``.

        Ties go to the mark that was counted first.

        Raises:
            ValueError: if nothing has been counted yet.
        """
        if not self.string_counts:
            # same exception type max() would raise, but with a usable message
            raise ValueError("get_best_proportion() was called before any quotation marks were counted.")
        best_str = max(self.string_counts, key=self.string_counts.__getitem__)
        return (best_str, self.string_counts[best_str], self.total_count)

    def calculate_num_differences(self, expected_quotation_mark: str) -> int:
        """Return how many counted marks differ from ``expected_quotation_mark``."""
        return self.total_count - self.string_counts.get(expected_quotation_mark, 0)

    def get_observed_count(self) -> int:
        return self.total_count


class QuotationMarkTabulator:
    """Count resolved quotation marks, grouped by ``(depth, direction)``."""

    def __init__(self):
        self.quotation_counts_by_depth_and_direction: dict[tuple[int, QuotationMarkDirection], QuotationMarkCounts] = (
            dict()
        )

    def tabulate(self, quotation_marks: "list[QuotationMarkMetadata]") -> None:
        """Add every mark in ``quotation_marks`` to the counts."""
        for quotation_mark in quotation_marks:
            self._count_quotation_mark(quotation_mark)

    def _count_quotation_mark(self, quote: "QuotationMarkMetadata") -> None:
        key = (quote.get_depth(), quote.get_direction())
        if key not in self.quotation_counts_by_depth_and_direction:
            self.quotation_counts_by_depth_and_direction[key] = QuotationMarkCounts()
        self.quotation_counts_by_depth_and_direction[key].count_quotation_mark(quote.get_quotation_mark())

    def _has_depth_and_direction_been_observed(self, depth: int, direction: "QuotationMarkDirection") -> bool:
        return (depth, direction) in self.quotation_counts_by_depth_and_direction

    def _get_most_common_quote_by_depth_and_direction(
        self, depth: int, direction: "QuotationMarkDirection"
    ) -> tuple[str, int, int]:
        return self.quotation_counts_by_depth_and_direction[(depth, direction)].get_best_proportion()

    def calculate_similarity(self, quote_convention: "QuoteConvention") -> float:
        """Return how closely the counted marks follow ``quote_convention``.

        The result is one minus the weighted share of marks that differ from
        the mark the convention expects for their depth and direction; it is
        0.0 when nothing has been counted.
        """
        weighted_differences = 0.0
        weighted_total = 0.0
        for (depth, direction), counts in self.quotation_counts_by_depth_and_direction.items():
            expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction)

            # give higher weight to shallower depths, since deeper marks are more likely to be mistakes
            weight = 2 ** (-depth)
            weighted_differences += counts.calculate_num_differences(expected_quotation_mark) * weight
            weighted_total += counts.get_observed_count() * weight

        if weighted_total == 0:
            return 0.0
        return 1 - (weighted_differences / weighted_total)

    def print_summary(self) -> None:
        """Print the most common opening and closing mark for depths 1 to 4 where both were seen."""
        for depth in range(1, 5):
            if not (
                self._has_depth_and_direction_been_observed(depth, QuotationMarkDirection.Opening)
                and self._has_depth_and_direction_been_observed(depth, QuotationMarkDirection.Closing)
            ):
                continue
            (opening_quotation_mark, observed_opening_count, total_opening_count) = (
                self._get_most_common_quote_by_depth_and_direction(depth, QuotationMarkDirection.Opening)
            )
            (closing_quotation_mark, observed_closing_count, total_closing_count) = (
                self._get_most_common_quote_by_depth_and_direction(depth, QuotationMarkDirection.Closing)
            )
            print(
                "The most common level %i quotes are %s (%i of %i opening quotes) and %s (%i of %i closing quotes)"
                % (
                    depth,
                    opening_quotation_mark,
                    observed_opening_count,
                    total_opening_count,
                    closing_quotation_mark,
                    observed_closing_count,
                    total_closing_count,
                )
            )
class SingleLevelQuoteConvention:
    """The opening and closing quotation mark used at one nesting level."""

    def __init__(self, opening_quote: str, closing_quote: str):
        self.opening_quote = opening_quote
        self.closing_quote = closing_quote

    def get_opening_quote(self) -> str:
        return self.opening_quote

    def get_closing_quote(self) -> str:
        return self.closing_quote


class QuoteConvention:
    """A named list of per-level quotation marks; levels are numbered from 1."""

    _ORDINAL_NAMES = {1: "First", 2: "Second", 3: "Third", 4: "Fourth"}

    def __init__(self, name: str, levels: "list[SingleLevelQuoteConvention]"):
        self.name = name
        self.levels = levels

    def get_name(self) -> str:
        return self.name

    def get_num_levels(self) -> int:
        return len(self.levels)

    def get_opening_quote_at_level(self, level: int) -> str:
        """Return the opening mark of the 1-based ``level``."""
        return self.levels[level - 1].get_opening_quote()

    def get_closing_quote_at_level(self, level: int) -> str:
        """Return the closing mark of the 1-based ``level``."""
        return self.levels[level - 1].get_closing_quote()

    def get_expected_quotation_mark(self, depth: int, direction: "QuotationMarkDirection") -> str:
        """Return the mark this convention uses at ``depth`` in ``direction``.

        Returns the empty string when ``depth`` is outside ``1..num_levels``.
        """
        # depth < 1 would otherwise index from the end of the level list
        if depth < 1 or depth > len(self.levels):
            return ""
        if direction == QuotationMarkDirection.Opening:
            return self.get_opening_quote_at_level(depth)
        return self.get_closing_quote_at_level(depth)

    def _includes_opening_quotation_mark(self, opening_quotation_mark: str) -> bool:
        return any(level.get_opening_quote() == opening_quotation_mark for level in self.levels)

    def _includes_closing_quotation_mark(self, closing_quotation_mark: str) -> bool:
        return any(level.get_closing_quote() == closing_quotation_mark for level in self.levels)

    def is_compatible_with_observed_quotation_marks(
        self, opening_quotation_marks: "list[str]", closing_quotation_marks: "list[str]"
    ) -> bool:
        """Return True when the observed marks could have come from this convention.

        Every observed mark must belong to some level in the matching
        direction, and both first-level marks must have been observed.
        """
        # a convention without levels has no first-level marks to observe
        if not self.levels:
            return False
        if not all(self._includes_opening_quotation_mark(mark) for mark in opening_quotation_marks):
            return False
        if not all(self._includes_closing_quotation_mark(mark) for mark in closing_quotation_marks):
            return False

        # we require the first-level quotes to have been observed
        return (
            self.get_opening_quote_at_level(1) in opening_quotation_marks
            and self.get_closing_quote_at_level(1) in closing_quotation_marks
        )

    def print_summary(self) -> None:
        """Print the convention name followed by one line per level."""
        print(self.get_name())
        for level, convention in enumerate(self.levels, start=1):
            ordinal_name = self._get_ordinal_name(level)
            print("%s%s-level quote%s" % (convention.get_opening_quote(), ordinal_name, convention.get_closing_quote()))

    def _get_ordinal_name(self, level: int) -> str:
        """Return "First".."Fourth" for levels 1-4, otherwise the number followed by "th"."""
        return self._ORDINAL_NAMES.get(level, str(level) + "th")
class QuoteConventionDetector(UsfmStructureExtractor):
    """Tabulate the quotation marks of the extracted chapters and pick the closest standard convention."""

    def __init__(self):
        super().__init__()
        self.quotation_mark_tabulator = QuotationMarkTabulator()

    def _count_quotation_marks_in_chapters(self, chapters: list[Chapter]) -> None:
        """Narrow the standard conventions for ``chapters``, then tabulate each chapter with that subset."""
        analyzer = PreliminaryQuotationAnalyzer(standard_quote_conventions)
        candidate_conventions: QuoteConventionSet = analyzer.narrow_down_possible_quote_conventions(chapters)

        for chapter in chapters:
            self._count_quotation_marks_in_chapter(chapter, candidate_conventions)

    def _count_quotation_marks_in_chapter(
        self, chapter: Chapter, possible_quote_conventions: QuoteConventionSet
    ) -> None:
        """Find, resolve and tabulate the quotation marks of one chapter."""
        finder = QuotationMarkFinder(possible_quote_conventions)
        candidate_matches: list[QuotationMarkStringMatch] = finder.find_all_potential_quotation_marks_in_chapter(
            chapter
        )

        # a fresh resolver per chapter, so no open-quote state carries over
        resolver = QuotationMarkResolver(possible_quote_conventions)
        resolved_marks: list[QuotationMarkMetadata] = list(resolver.resolve_quotation_marks(candidate_matches))

        self.quotation_mark_tabulator.tabulate(resolved_marks)

    def detect_quotation_convention(self, print_summary: bool) -> Union[QuoteConventionAnalysis, None]:
        """Return the best-matching standard convention with its score, or None.

        None is returned when no convention was found or the best score is not
        positive. When ``print_summary`` is true the tabulator's summary is
        printed before the result is returned.
        """
        self._count_quotation_marks_in_chapters(self.get_chapters())

        best_match = standard_quote_conventions.find_most_similar_convention(self.quotation_mark_tabulator)
        best_quote_convention, score = best_match

        if print_summary:
            self.quotation_mark_tabulator.print_summary()

        if best_quote_convention is None or not score > 0:
            return None
        return QuoteConventionAnalysis(best_quote_convention, score)
class QuoteConventionSet:
    """A collection of quote conventions with lookup tables over all of their marks."""

    def __init__(self, conventions: "List[QuoteConvention]"):
        self.conventions = conventions
        self._create_quote_regexes()
        self._create_quotation_mark_pair_map()

    def _create_quote_regexes(self) -> None:
        """Compile the regexes that match any opening, any closing, and any quotation-mark character."""
        opening_quotation_marks: Set[str] = set()
        closing_quotation_marks: Set[str] = set()
        for convention in self.conventions:
            for level in range(1, convention.get_num_levels() + 1):
                opening_quotation_marks.add(convention.get_opening_quote_at_level(level))
                closing_quotation_marks.add(convention.get_closing_quote_at_level(level))

        self.opening_quotation_mark_regex: Pattern = self._compile_character_class(opening_quotation_marks)
        self.closing_quotation_mark_regex: Pattern = self._compile_character_class(closing_quotation_marks)
        self.all_quotation_mark_regex: Pattern = self._compile_character_class(
            opening_quotation_marks | closing_quotation_marks
        )

    @staticmethod
    def _compile_character_class(quotation_marks: Set[str]) -> Pattern:
        """Compile a character class over every character that occurs in ``quotation_marks``.

        Multi-character marks such as ``<<`` contribute their individual
        characters, which is what concatenating the marks inside ``[...]``
        already did. With no characters at all the empty pattern is compiled,
        as before.
        """
        characters = sorted({character for mark in quotation_marks for character in mark})
        if not characters:
            # "[]" is not a valid pattern, so fall back to the empty pattern
            return regex.compile(r"")
        # escape so that characters special inside a set (], ^, -, \) stay literal
        return regex.compile("[" + "".join(regex.escape(character) for character in characters) + "]")

    def _create_quotation_mark_pair_map(self) -> None:
        """Index every (opening, closing) pair of every level in both directions."""
        self.closing_marks_by_opening_mark: Dict[str, set[str]] = dict()
        self.opening_marks_by_closing_mark: Dict[str, set[str]] = dict()
        for convention in self.conventions:
            for level in range(1, convention.get_num_levels() + 1):
                opening_quote = convention.get_opening_quote_at_level(level)
                closing_quote = convention.get_closing_quote_at_level(level)
                self.closing_marks_by_opening_mark.setdefault(opening_quote, set()).add(closing_quote)
                self.opening_marks_by_closing_mark.setdefault(closing_quote, set()).add(opening_quote)

    def get_possible_opening_marks(self) -> list[str]:
        return list(self.closing_marks_by_opening_mark.keys())

    def get_possible_closing_marks(self) -> list[str]:
        return list(self.opening_marks_by_closing_mark.keys())

    def is_valid_opening_quotation_mark(self, quotation_mark: str) -> bool:
        return quotation_mark in self.closing_marks_by_opening_mark

    def is_valid_closing_quotation_mark(self, quotation_mark: str) -> bool:
        # the inverse index holds exactly the closing marks, so no scan is needed
        return quotation_mark in self.opening_marks_by_closing_mark

    def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool:
        """Return True when some level of some convention pairs ``opening_mark`` with ``closing_mark``."""
        return closing_mark in self.closing_marks_by_opening_mark.get(opening_mark, ())

    def is_quotation_mark_direction_ambiguous(self, quotation_mark: str) -> bool:
        """Return True when ``quotation_mark`` is paired with itself at some level."""
        return quotation_mark in self.closing_marks_by_opening_mark.get(quotation_mark, ())

    def get_possible_paired_quotation_marks(self, quotation_mark: str) -> Set[str]:
        """Return every mark that is paired with ``quotation_mark`` in either direction."""
        paired_quotation_marks: Set[str] = set()
        paired_quotation_marks.update(self.closing_marks_by_opening_mark.get(quotation_mark, ()))
        paired_quotation_marks.update(self.opening_marks_by_closing_mark.get(quotation_mark, ()))
        return paired_quotation_marks

    def get_opening_quotation_mark_regex(self) -> Pattern:
        return self.opening_quotation_mark_regex

    def get_closing_quotation_mark_regex(self) -> Pattern:
        return self.closing_quotation_mark_regex

    def get_quotation_mark_regex(self) -> Pattern:
        return self.all_quotation_mark_regex

    def filter_to_compatible_quote_conventions(
        self, opening_quotation_marks: "list[str]", closing_quotation_marks: "list[str]"
    ) -> "QuoteConventionSet":
        """Return a new set holding only the conventions compatible with the observed marks."""
        return QuoteConventionSet(
            [
                convention
                for convention in self.conventions
                if convention.is_compatible_with_observed_quotation_marks(
                    opening_quotation_marks, closing_quotation_marks
                )
            ]
        )

    def find_most_similar_convention(
        self, tabulated_quotation_marks: "QuotationMarkTabulator"
    ) -> "Tuple[Union[QuoteConvention, None], float]":
        """Return the convention with the highest similarity and that similarity.

        Ties go to the earlier convention. For an empty set the result is
        ``(None, float("-inf"))``.
        """
        best_similarity: float = float("-inf")
        best_quote_convention = None
        for quote_convention in self.conventions:
            similarity = tabulated_quotation_marks.calculate_similarity(quote_convention)
            if similarity > best_similarity:
                best_similarity = similarity
                best_quote_convention = quote_convention

        return (best_quote_convention, best_similarity)

    def print_summary(self) -> None:
        print("Opening quotation marks must be one of the following: ", self.get_possible_opening_marks())
        print("Closing quotation marks must be one of the following: ", self.get_possible_closing_marks())
SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ), + QuoteConvention( + "british_english", + [ + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + ], + ), + QuoteConvention( + "british_typewriter_english", + [ + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + ], + ), + QuoteConvention( + "hybrid_typewriter_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + ], + ), + QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ), + QuoteConvention( + "typewriter_french", + [ + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention("<", ">"), + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention("<", ">"), + ], + ), + QuoteConvention( + "french_variant", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "british_inspired_western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + ], + ), + QuoteConvention( + 
"typewriter_western_european", + [ + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ), + QuoteConvention( + "typewriter_western_european_variant", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("<", ">"), + SingleLevelQuoteConvention("'", "'"), + ], + ), + QuoteConvention( + "hybrid_typewriter_western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ), + QuoteConvention( + "hybrid_british_typewriter_western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + ], + ), + QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ), + QuoteConvention( + "central_european_guillemets", + [ + SingleLevelQuoteConvention("\u00bb", "\u00ab"), + SingleLevelQuoteConvention("\u203a", "\u2039"), + SingleLevelQuoteConvention("\u00bb", "\u00ab"), + SingleLevelQuoteConvention("\u203a", "\u2039"), + ], + ), + QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ), + QuoteConvention( + "standard_finnish", + [ + SingleLevelQuoteConvention("\u00bb", "\u00bb"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ), + QuoteConvention( + "eastern_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + ], + ), + QuoteConvention( + 
"standard_russian", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ), + QuoteConvention( + "standard_arabic", + [ + SingleLevelQuoteConvention("\u201d", "\u201c"), + SingleLevelQuoteConvention("\u2019", "\u2018"), + SingleLevelQuoteConvention("\u201d", "\u201c"), + SingleLevelQuoteConvention("\u2019", "\u2018"), + ], + ), + QuoteConvention( + "non-standard_arabic", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2019", "\u2018"), + ], + ), + ] +) diff --git a/machine/corpora/analysis/text_segment.py b/machine/corpora/analysis/text_segment.py new file mode 100644 index 00000000..1f8819b2 --- /dev/null +++ b/machine/corpora/analysis/text_segment.py @@ -0,0 +1,74 @@ +from typing import Set, Union + +from .usfm_marker_type import UsfmMarkerType + + +class TextSegment: + def __init__(self): + self.text = "" + self.immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NoMarker + self.markers_in_preceding_context: Set[UsfmMarkerType] = set() + self.previous_segment: Union[TextSegment, None] = None + self.next_segment: Union[TextSegment, None] = None + self.index_in_verse: int = 0 + self.num_segments_in_verse: int = 0 + + def get_text(self) -> str: + return self.text + + def length(self) -> int: + return len(self.text) + + def substring_before(self, index: int) -> str: + return self.text[0:index] + + def substring_after(self, index: int) -> str: + return self.text[index:-1] + + def get_immediate_preceding_marker_type(self) -> UsfmMarkerType: + return self.immediate_preceding_marker + + def is_marker_in_preceding_context(self, marker: UsfmMarkerType) -> bool: + return marker in self.markers_in_preceding_context + + def get_previous_segment(self) -> "TextSegment | None": + return self.previous_segment + + def get_next_segment(self) -> "TextSegment | None": + return self.next_segment + + def 
is_first_segment_in_verse(self) -> bool: + return self.index_in_verse == 0 + + def is_last_segment_in_verse(self) -> bool: + return self.index_in_verse == self.num_segments_in_verse - 1 + + # These setters need to be implemented outside the builder to avoid circular dependencies + def set_next_segment(self, next_segment: "TextSegment") -> None: + self.next_segment = next_segment + + def set_index_in_verse(self, index_in_verse: int) -> None: + self.index_in_verse = index_in_verse + + def set_num_segments_in_verse(self, num_segments_in_verse: int) -> None: + self.num_segments_in_verse = num_segments_in_verse + + class Builder: + def __init__(self): + self.text_segment = TextSegment() + + def set_previous_segment(self, previous_segment: "TextSegment") -> "TextSegment.Builder": + self.text_segment.previous_segment = previous_segment + return self + + def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder": + self.text_segment.immediate_preceding_marker = marker + self.text_segment.markers_in_preceding_context.add(marker) + return self + + def set_text(self, text: str) -> "TextSegment.Builder": + self.text_segment.text = text + return self + + def build(self) -> "TextSegment": + return self.text_segment diff --git a/machine/corpora/analysis/usfm_marker_type.py b/machine/corpora/analysis/usfm_marker_type.py new file mode 100644 index 00000000..e1dfc2c4 --- /dev/null +++ b/machine/corpora/analysis/usfm_marker_type.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class UsfmMarkerType(Enum): + ParagraphMarker = "ParagraphMarker" + CharacterMarker = "CharacterMarker" + VerseMarker = "VerseMarker" + ChapterMarker = "ChapterMarker" + EmbedMarker = "Embed" + Other = "Other" + NoMarker = "NoMarker" diff --git a/machine/corpora/analysis/usfm_structure_extractor.py b/machine/corpora/analysis/usfm_structure_extractor.py new file mode 100644 index 00000000..a36c3f59 --- /dev/null +++ b/machine/corpora/analysis/usfm_structure_extractor.py @@ -0,0 +1,99 @@ +from 
typing import Optional, Sequence + +from ..usfm_parser_handler import UsfmParserHandler +from ..usfm_parser_state import UsfmParserState +from ..usfm_token import UsfmAttribute +from .chapter import Chapter +from .text_segment import TextSegment +from .usfm_marker_type import UsfmMarkerType +from .verse import Verse + + +class UsfmStructureExtractor(UsfmParserHandler): + def __init__(self): + self._reset() + + def _reset(self): + self.text_segments: list[TextSegment] = [] + self.next_text_segment_builder: TextSegment.Builder = TextSegment.Builder() + + def chapter( + self, + state: UsfmParserState, + number: str, + marker: str, + alt_number: Optional[str], + pub_number: Optional[str], + ) -> None: + self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) + + def start_para( + self, + state: UsfmParserState, + marker: str, + unknown: bool, + attributes: Optional[Sequence[UsfmAttribute]], + ) -> None: + self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker) + + def start_char( + self, + state: UsfmParserState, + marker_without_plus: str, + unknown: bool, + attributes: Optional[Sequence[UsfmAttribute]], + ) -> None: + self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) + + def end_char( + self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool + ) -> None: + self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) + + def verse( + self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str] + ) -> None: + self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) + + def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: + self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + + def end_table(self, state: UsfmParserState) -> None: + 
self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + + def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: + self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + + def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None: + self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + + def text(self, state: UsfmParserState, text: str) -> None: + if not state.is_verse_text: + return + if len(text) > 0: + self.next_text_segment_builder.set_text(text) + text_segment: TextSegment = self.next_text_segment_builder.build() + if len(self.text_segments) > 0: + self.text_segments[-1].set_next_segment(text_segment) + self.text_segments.append(text_segment) + self.next_text_segment_builder = TextSegment.Builder() + + def get_chapters(self) -> list[Chapter]: + chapters: list[Chapter] = [] + current_chapter_verses: list[Verse] = [] + current_verse_segments: list[TextSegment] = [] + for text_segment in self.text_segments: + if text_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker): + if len(current_verse_segments) > 0: + current_chapter_verses.append(Verse(current_verse_segments)) + current_verse_segments = [] + if text_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker): + if len(current_chapter_verses) > 0: + chapters.append(Chapter(current_chapter_verses)) + current_chapter_verses = [] + current_verse_segments.append(text_segment) + if len(current_verse_segments) > 0: + current_chapter_verses.append(Verse(current_verse_segments)) + if len(current_chapter_verses) > 0: + chapters.append(Chapter(current_chapter_verses)) + return chapters diff --git a/machine/corpora/analysis/verse.py b/machine/corpora/analysis/verse.py new file mode 100644 index 00000000..98fc58ca --- /dev/null +++ b/machine/corpora/analysis/verse.py @@ -0,0 +1,15 @@ +from .text_segment import TextSegment + + +class Verse: + def __init__(self, 
text_segments: list[TextSegment]): + self.text_segments = text_segments + self._index_text_segments() + + def _index_text_segments(self) -> None: + for index, text_segment in enumerate(self.text_segments): + text_segment.set_index_in_verse(index) + text_segment.set_num_segments_in_verse(len(self.text_segments)) + + def get_text_segments(self) -> list[TextSegment]: + return self.text_segments diff --git a/tests/corpora/analysis/test_quote_convention_detector.py b/tests/corpora/analysis/test_quote_convention_detector.py new file mode 100644 index 00000000..59321a0e --- /dev/null +++ b/tests/corpora/analysis/test_quote_convention_detector.py @@ -0,0 +1,305 @@ +from typing import Union + +from machine.corpora import parse_usfm +from machine.corpora.analysis import QuoteConventionAnalysis, QuoteConventionDetector + +# Text comes from the World English Bible, which is in the public domain. + + +def test_standard_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "standard_english" + + +def test_typewriter_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, \"Has God really said, + 'You shall not eat of any tree of the garden'?\" + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "typewriter_english" + + +def test_british_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ‘Has God really said, + “You shall not eat of any tree of the garden”?’ + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "british_english" + + +def test_british_typewriter_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + \"You shall not eat of any tree of the garden\"?' + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "british_typewriter_english" + + +def test_hybrid_typewriter_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + 'You shall not eat of any tree of the garden'?” + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "hybrid_typewriter_english" + + +def test_standard_french() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ‹You shall not eat of any tree of the garden›?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "standard_french" + + +def test_typewriter_french() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, <?>> + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "typewriter_french" + + +# french_variant requires a 3rd-level of quotes to differentiate from standard_french + + +def test_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + “You shall not eat of any tree of the garden”?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "western_european" + + +def test_british_inspired_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ‘You shall not eat of any tree of the garden’?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "british_inspired_western_european" + + +def test_typewriter_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <> + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "typewriter_western_european" + + +def test_typewriter_western_european_variant() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + ?" 
+ """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "typewriter_western_european_variant" + + +def test_hybrid_typewriter_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + "You shall not eat of any tree of the garden"?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "hybrid_typewriter_western_european" + + +def test_hybrid_british_typewriter_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + 'You shall not eat of any tree of the garden'?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "hybrid_british_typewriter_western_european" + + +def test_central_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, „Has God really said, + ‚You shall not eat of any tree of the garden‘?“ + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "central_european" + + +def test_central_european_guillemets() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, »Has God really said, + ›You shall not eat of any tree of the garden‹?« + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "central_european_guillemets" + + +def test_standard_swedish() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ”Has God really said, + ’You shall not eat of any tree of the garden’?” + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "standard_swedish" + + +def test_standard_finnish() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, »Has God really said, + ’You shall not eat of any tree of the garden’?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "standard_finnish" + + +def test_eastern_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, „Has God really said, + ‚You shall not eat of any tree of the garden’?” + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "eastern_european" + + +def test_standard_russian() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, «Has God really said, + „You shall not eat of any tree of the garden“?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "standard_russian" + + +def test_standard_arabic() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ”Has God really said, + ’You shall not eat of any tree of the garden‘?“ + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "standard_arabic" + + +def test_non_standard_arabic() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ’You shall not eat of any tree of the garden‘?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "non-standard_arabic" + + +def test_mismatched_quotation_marks() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + \\v 2 The woman said to the serpent, + “We may eat fruit from the trees of the garden, + \\v 3 but not the fruit of the tree which is in the middle of the garden. + God has said, ‘You shall not eat of it. 
You shall not touch it, lest you die.’ + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.get_best_quote_convention().get_name() == "standard_english" + + +def detect_quote_convention(usfm: str) -> Union[QuoteConventionAnalysis, None]: + quote_convention_detector = QuoteConventionDetector() + parse_usfm(usfm, quote_convention_detector) + return quote_convention_detector.detect_quotation_convention(print_summary=False) From e39e841aabfe946ef3fc9722c0f217930d44cf7a Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 11 Apr 2025 14:10:35 -0400 Subject: [PATCH 04/31] Initial working version of quotation denormalization --- machine/corpora/__init__.py | 4 + machine/corpora/analysis/__init__.py | 17 +- .../corpora/analysis/quotation_mark_finder.py | 21 +- .../analysis/quotation_mark_metadata.py | 25 +- .../analysis/quotation_mark_string_match.py | 4 +- machine/corpora/analysis/quote_convention.py | 33 ++ .../corpora/analysis/quote_convention_set.py | 6 + machine/corpora/analysis/text_segment.py | 14 + .../analysis/usfm_structure_extractor.py | 8 +- ...lization_scripture_update_block_handler.py | 77 +++++ ...lization_scripture_block_update_handler.py | 294 ++++++++++++++++++ 11 files changed, 492 insertions(+), 11 deletions(-) create mode 100644 machine/corpora/quotation_denormalization_scripture_update_block_handler.py create mode 100644 tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 45eb628b..bf5f611c 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -24,6 +24,9 @@ from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler +from .quotation_denormalization_scripture_update_block_handler import ( + 
QuotationDenormalizationScriptureUpdateBlockHandler, +) from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -121,6 +124,7 @@ "PlaceMarkersAlignmentInfo", "PlaceMarkersUsfmUpdateBlockHandler", "parse_usfm", + "QuotationDenormalizationScriptureUpdateBlockHandler", "RtlReferenceOrder", "ScriptureElement", "ScriptureRef", diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/analysis/__init__.py index e8cd623c..411a5643 100644 --- a/machine/corpora/analysis/__init__.py +++ b/machine/corpora/analysis/__init__.py @@ -1,3 +1,18 @@ +from .quotation_mark_resolver import QuotationMarkResolver +from .quotation_mark_string_match import QuotationMarkStringMatch +from .quote_convention import QuoteConvention from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector +from .quote_convention_set import QuoteConventionSet +from .text_segment import TextSegment +from .usfm_marker_type import UsfmMarkerType -__all__ = ["QuoteConventionAnalysis", "QuoteConventionDetector"] +__all__ = [ + "QuotationMarkResolver", + "QuotationMarkStringMatch", + "QuoteConvention", + "QuoteConventionAnalysis", + "QuoteConventionDetector", + "QuoteConventionSet", + "TextSegment", + "UsfmMarkerType", +] diff --git a/machine/corpora/analysis/quotation_mark_finder.py b/machine/corpora/analysis/quotation_mark_finder.py index fb187171..73078abc 100644 --- a/machine/corpora/analysis/quotation_mark_finder.py +++ b/machine/corpora/analysis/quotation_mark_finder.py @@ -1,3 +1,5 @@ +from typing import List + import regex from .chapter import Chapter @@ -13,22 +15,27 @@ class QuotationMarkFinder: def __init__(self, quote_convention_set: QuoteConventionSet): self.quote_convention_set = quote_convention_set - def find_all_potential_quotation_marks_in_chapter(self, chapter: Chapter) -> list[QuotationMarkStringMatch]: - 
quotation_matches: list[QuotationMarkStringMatch] = [] + def find_all_potential_quotation_marks_in_chapter(self, chapter: Chapter) -> List[QuotationMarkStringMatch]: + quotation_matches: List[QuotationMarkStringMatch] = [] for verse in chapter.get_verses(): quotation_matches.extend(self.find_all_potential_quotation_marks_in_verse(verse)) return quotation_matches - def find_all_potential_quotation_marks_in_verse(self, verse: Verse) -> list[QuotationMarkStringMatch]: - quotation_matches: list[QuotationMarkStringMatch] = [] - for text_segment in verse.get_text_segments(): + def find_all_potential_quotation_marks_in_verse(self, verse: Verse) -> List[QuotationMarkStringMatch]: + return self.find_all_potential_quotation_marks_in_text_segments(verse.get_text_segments()) + + def find_all_potential_quotation_marks_in_text_segments( + self, text_segments: List[TextSegment] + ) -> list[QuotationMarkStringMatch]: + quotation_matches: List[QuotationMarkStringMatch] = [] + for text_segment in text_segments: quotation_matches.extend(self.find_all_potential_quotation_marks_in_text_segment(text_segment)) return quotation_matches def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment - ) -> list[QuotationMarkStringMatch]: - quotation_matches: list[QuotationMarkStringMatch] = [] + ) -> List[QuotationMarkStringMatch]: + quotation_matches: List[QuotationMarkStringMatch] = [] for quote_match in self.quote_pattern.finditer(text_segment.get_text()): if self.quote_convention_set.is_valid_opening_quotation_mark( quote_match.group() diff --git a/machine/corpora/analysis/quotation_mark_metadata.py b/machine/corpora/analysis/quotation_mark_metadata.py index 02bba93d..72736ec1 100644 --- a/machine/corpora/analysis/quotation_mark_metadata.py +++ b/machine/corpora/analysis/quotation_mark_metadata.py @@ -1,13 +1,22 @@ from .quotation_mark_direction import QuotationMarkDirection +from .quote_convention import QuoteConvention +from .text_segment import TextSegment 
class QuotationMarkMetadata: def __init__( - self, quotation_mark: str, depth: int, direction: QuotationMarkDirection, start_index: int, end_index: int + self, + quotation_mark: str, + depth: int, + direction: QuotationMarkDirection, + text_segment: TextSegment, + start_index: int, + end_index: int, ): self.quotation_mark = quotation_mark self.depth = depth self.direction = direction + self.text_segment = text_segment self.start_index = start_index self.end_index = end_index @@ -20,8 +29,22 @@ def get_depth(self) -> int: def get_direction(self) -> QuotationMarkDirection: return self.direction + def get_text_segment(self) -> TextSegment: + return self.text_segment + def get_start_index(self) -> int: return self.start_index def get_end_index(self) -> int: return self.end_index + + def update_quotation_mark(self, quote_convention: QuoteConvention) -> None: + updated_quotation_mark = quote_convention.get_expected_quotation_mark(self.depth, self.direction) + if updated_quotation_mark == self.quotation_mark: + return + + self.text_segment.replace_substring( + self.start_index, + self.end_index, + updated_quotation_mark, + ) diff --git a/machine/corpora/analysis/quotation_mark_string_match.py b/machine/corpora/analysis/quotation_mark_string_match.py index 624e57ad..ee0e5d08 100644 --- a/machine/corpora/analysis/quotation_mark_string_match.py +++ b/machine/corpora/analysis/quotation_mark_string_match.py @@ -87,7 +87,9 @@ def get_context(self) -> str: ] def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: - return QuotationMarkMetadata(self.get_quotation_mark(), depth, direction, self.start_index, self.end_index) + return QuotationMarkMetadata( + self.get_quotation_mark(), depth, direction, self.text_segment, self.start_index, self.end_index + ) def is_at_start_of_segment(self) -> bool: return self.start_index == 0 diff --git a/machine/corpora/analysis/quote_convention.py b/machine/corpora/analysis/quote_convention.py index 
50dc3a0f..2f28a595 100644 --- a/machine/corpora/analysis/quote_convention.py +++ b/machine/corpora/analysis/quote_convention.py @@ -1,5 +1,22 @@ +from typing import Dict + from .quotation_mark_direction import QuotationMarkDirection +quote_normalization_map: Dict[str, str] = { + "\u00ab": '"', + "\u00bb": '"', + "\u2018": "'", + "\u2019": "'", + "\u201a": "'", + "\u201c": '"', + "\u201d": '"', + "\u201e": '"', + "\u300a": '"', + "\u300b": '"', + "\u300c": '"', + "\u300d": '"', +} + class SingleLevelQuoteConvention: def __init__(self, opening_quote: str, closing_quote: str): @@ -12,6 +29,19 @@ def get_opening_quote(self) -> str: def get_closing_quote(self) -> str: return self.closing_quote + def normalize(self) -> "SingleLevelQuoteConvention": + normalized_opening_quote = ( + quote_normalization_map[self.opening_quote] + if self.opening_quote in quote_normalization_map + else self.opening_quote + ) + normalized_closing_quote = ( + quote_normalization_map[self.closing_quote] + if self.closing_quote in quote_normalization_map + else self.closing_quote + ) + return SingleLevelQuoteConvention(normalized_opening_quote, normalized_closing_quote) + class QuoteConvention: def __init__(self, name: str, levels: list[SingleLevelQuoteConvention]): @@ -68,6 +98,9 @@ def is_compatible_with_observed_quotation_marks( return False return True + def normalize(self) -> "QuoteConvention": + return QuoteConvention(self.get_name() + "_normalized", [level.normalize() for level in self.levels]) + def print_summary(self) -> None: print(self.get_name()) for level, convention in enumerate(self.levels): diff --git a/machine/corpora/analysis/quote_convention_set.py b/machine/corpora/analysis/quote_convention_set.py index f34ab453..c115f4d9 100644 --- a/machine/corpora/analysis/quote_convention_set.py +++ b/machine/corpora/analysis/quote_convention_set.py @@ -50,6 +50,12 @@ def _create_quotation_mark_pair_map(self) -> None: self.opening_marks_by_closing_mark[closing_quote] = set() 
self.opening_marks_by_closing_mark[closing_quote].add(opening_quote) + def get_quote_convention_by_name(self, name: str) -> Union[QuoteConvention, None]: + for convention in self.conventions: + if convention.get_name() == name: + return convention + return None + def get_possible_opening_marks(self) -> list[str]: return list(self.closing_marks_by_opening_mark.keys()) diff --git a/machine/corpora/analysis/text_segment.py b/machine/corpora/analysis/text_segment.py index 1f8819b2..08080341 100644 --- a/machine/corpora/analysis/text_segment.py +++ b/machine/corpora/analysis/text_segment.py @@ -1,5 +1,6 @@ from typing import Set, Union +from ..usfm_token import UsfmToken from .usfm_marker_type import UsfmMarkerType @@ -12,6 +13,7 @@ def __init__(self): self.next_segment: Union[TextSegment, None] = None self.index_in_verse: int = 0 self.num_segments_in_verse: int = 0 + self.usfm_token: Union[UsfmToken, None] = None def get_text(self) -> str: return self.text @@ -43,7 +45,15 @@ def is_first_segment_in_verse(self) -> bool: def is_last_segment_in_verse(self) -> bool: return self.index_in_verse == self.num_segments_in_verse - 1 + def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: + self.text = self.text[:start_index] + replacement + self.text[end_index:] + if self.usfm_token is not None: + self.usfm_token.text = self.text + # These setters need to be implemented outside the builder to avoid circular dependencies + def set_previous_segment(self, previous_segment: "TextSegment") -> None: + self.previous_segment = previous_segment + def set_next_segment(self, next_segment: "TextSegment") -> None: self.next_segment = next_segment @@ -66,6 +76,10 @@ def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder": self.text_segment.markers_in_preceding_context.add(marker) return self + def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": + self.text_segment.usfm_token = token + return self + def set_text(self, 
text: str) -> "TextSegment.Builder": self.text_segment.text = text return self diff --git a/machine/corpora/analysis/usfm_structure_extractor.py b/machine/corpora/analysis/usfm_structure_extractor.py index a36c3f59..71968780 100644 --- a/machine/corpora/analysis/usfm_structure_extractor.py +++ b/machine/corpora/analysis/usfm_structure_extractor.py @@ -73,8 +73,14 @@ def text(self, state: UsfmParserState, text: str) -> None: if len(text) > 0: self.next_text_segment_builder.set_text(text) text_segment: TextSegment = self.next_text_segment_builder.build() - if len(self.text_segments) > 0: + # don't look past verse boundaries, to enable identical functionality in the + # online one-verse-at-a-time (QuotationDenormalizationScriptureUpdateBlockHandler) + # and offline whole-book-at-once settings (QuoteConventionDetector) + if len(self.text_segments) > 0 and not text_segment.is_marker_in_preceding_context( + UsfmMarkerType.VerseMarker + ): self.text_segments[-1].set_next_segment(text_segment) + text_segment.set_previous_segment(self.text_segments[-1]) self.text_segments.append(text_segment) self.next_text_segment_builder = TextSegment.Builder() diff --git a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py b/machine/corpora/quotation_denormalization_scripture_update_block_handler.py new file mode 100644 index 00000000..163310b2 --- /dev/null +++ b/machine/corpora/quotation_denormalization_scripture_update_block_handler.py @@ -0,0 +1,77 @@ +from typing import List + +from .analysis.quotation_mark_finder import QuotationMarkFinder +from .analysis.quotation_mark_resolver import QuotationMarkResolver +from .analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .analysis.quote_convention import QuoteConvention +from .analysis.quote_convention_set import QuoteConventionSet +from .analysis.text_segment import TextSegment +from .analysis.usfm_marker_type import UsfmMarkerType +from .scripture_update_block import ScriptureUpdateBlock 
+from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase +from .scripture_update_element import ScriptureUpdateElement +from .usfm_token import UsfmTokenType + + +class QuotationDenormalizationScriptureUpdateBlockHandler(ScriptureUpdateBlockHandlerBase): + + def __init__(self, target_quote_convention: QuoteConvention): + self._target_quote_convention: QuoteConvention = target_quote_convention + self._normalized_quote_convention: QuoteConvention = target_quote_convention.normalize() + self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( + QuoteConventionSet([self._normalized_quote_convention]) + ) + self._quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( + QuoteConventionSet([self._normalized_quote_convention]) + ) + self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder() + + def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + for element_index, element in enumerate(block._elements): + self._process_scripture_element(element) + return block + + def _process_scripture_element(self, element: ScriptureUpdateElement) -> None: + text_segments: List[TextSegment] = self._create_text_segments(element) + quotation_mark_matches: List[QuotationMarkStringMatch] = ( + self._quotation_mark_finder.find_all_potential_quotation_marks_in_text_segments(text_segments) + ) + for match in quotation_mark_matches: + print(match.get_context()) + for resolved_quotation_mark in self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches): + resolved_quotation_mark.update_quotation_mark(self._target_quote_convention) + + def _create_text_segments(self, element: ScriptureUpdateElement) -> List[TextSegment]: + text_segments: List[TextSegment] = [] + for token in element.get_tokens(): + if token.type == UsfmTokenType.CHAPTER: + self._quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( + 
QuoteConventionSet([self._normalized_quote_convention]) + ) + self._next_scripture_text_segment_builder = TextSegment.Builder() + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) + elif token.type == UsfmTokenType.VERSE: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) + elif token.type == UsfmTokenType.PARAGRAPH: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker) + elif token.type == UsfmTokenType.CHARACTER: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) + elif token.type == UsfmTokenType.NOTE: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + # TODO: create a text segment for the embed + elif token.type == UsfmTokenType.TEXT: + self._next_scripture_text_segment_builder.set_usfm_token(token) + if token.text is not None: + self._next_scripture_text_segment_builder.set_text(token.text) + text_segments.append(self._next_scripture_text_segment_builder.build()) + else: + self._next_scripture_text_segment_builder = TextSegment.Builder() + return self._set_previous_and_next_for_segments(text_segments) + + def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) -> List[TextSegment]: + for i in range(len(text_segments)): + if i > 0: + text_segments[i].set_previous_segment(text_segments[i - 1]) + if i < len(text_segments) - 1: + text_segments[i].set_next_segment(text_segments[i + 1]) + return text_segments diff --git a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py b/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py new file mode 100644 index 00000000..0c2e2006 --- /dev/null +++ b/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py @@ -0,0 +1,294 @@ +from machine.corpora import QuotationDenormalizationScriptureUpdateBlockHandler, 
UpdateUsfmParserHandler, parse_usfm +from machine.corpora.analysis import standard_quote_conventions + +simple_normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'?" + """ + + +def test_simple_english_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_british_english_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + "You shall not eat of any tree of the garden"?' + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "british_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# no denormalization should be needed for this example +def test_simple_typewriter_english_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, \"Has God really said, 'You shall not eat of any tree of the garden'?\"" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# some of the quotes shouldn't need to be denormalized +def test_simple_hybrid_typewriter_english_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, 'You shall not eat of any tree of the garden'?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "hybrid_typewriter_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# the single guillemets shouldn't need to be denormalized +# because Moses doesn't normalize them +def test_simple_french_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + ‹You shall not eat of any tree of the garden›?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, ‹You shall not eat of any tree of the garden›?»" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_french") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# the unusual quotation marks shouldn't need to be denormalized +def test_simple_typewriter_french_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <?>> + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, <?>>" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_french") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# the 1st- and 2nd-level quotes are denormalized to identical marks +def test_simple_western_european_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + "You shall not eat of any tree of the garden"?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, “You shall not eat of any tree of the garden”?»" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "western_european") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_typewriter_western_european_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <> + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + 'the woman, <>' + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_western_european") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_typewriter_western_european_variant_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + ?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + 'the woman, "Has God really said, ?"' + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_western_european_variant") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_hybrid_typewriter_western_european_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + "You shall not eat of any tree of the garden"?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + 'the woman, «Has God really said, "You shall not eat of any tree of the garden"?»' + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "hybrid_typewriter_western_european") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_central_european_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + "You shall not eat of any tree of the garden"?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden‘?“" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "central_european") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_central_european_guillemets_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + ›You shall not eat of any tree of the garden‹?" 
+ """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, »Has God really said, ›You shall not eat of any tree of the garden‹?«" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "central_european_guillemets") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_swedish_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_swedish") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_finnish_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, »Has God really said, ’You shall not eat of any tree of the garden’?»" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_finnish") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_eastern_european_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "eastern_european") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_russian_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + "You shall not eat of any tree of the garden"?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, «Has God really said, „You shall not eat of any tree of the garden“?»" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_russian") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_arabic_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden‘?“" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_arabic") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def denormalize_quotation_marks(normalized_usfm: str, quote_convention_name: str) -> str: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(quote_convention_name) + ) + assert standard_english_quote_convention is not None + + quotation_denormalizer: QuotationDenormalizationScriptureUpdateBlockHandler = ( + QuotationDenormalizationScriptureUpdateBlockHandler(standard_english_quote_convention) + ) + updater = UpdateUsfmParserHandler(update_block_handlers=[quotation_denormalizer]) + parse_usfm(normalized_usfm, updater) + + return updater.get_usfm() + + +def assert_usfm_equal(observed_usfm: str, expected_usfm: str) -> None: + for observed_line, expected_line in zip(observed_usfm.split("\n"), expected_usfm.split("\n")): + assert observed_line.strip() == expected_line.strip() From 32fb53b93fedb6c6b37ffddc11f81fa9b571a6e3 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 3 Apr 2025 13:14:24 -0400 Subject: [PATCH 05/31] I want to process the data in segments that correspond to individual translations. These updates make it happen. 
--- tests/corpora/test_update_usfm_parser_handler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 47cd6280..510ae264 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -209,6 +209,7 @@ def test_paragraph_in_verse(): usfm = r"""\id MAT - Test \c 1 \p paragraph not in a verse +\p paragraph not in a verse \v 1 verse 1 \p inner verse paragraph \s1 Section Header \v 2 Verse 2 \p inner verse paragraph @@ -219,6 +220,7 @@ def test_paragraph_in_verse(): result = r"""\id MAT - Test \c 1 \p paragraph not in a verse +\p paragraph not in a verse \v 1 Update 1 \s1 Section Header \v 2 Verse 2 inner verse paragraph @@ -235,7 +237,7 @@ def test_paragraph_in_verse(): result_strip = r"""\id MAT \c 1 -\p +\p \v 1 Update 1 \s1 \v 2 From 072bcb7e6b275aa711ce48a7866e6acb0e10eb8c Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 10 Apr 2025 13:33:40 -0400 Subject: [PATCH 06/31] Updates for reviewer comments --- machine/corpora/scripture_update_block.py | 4 + ...e.py => scripture_update_block_handler.py} | 3 +- ...date_block_handler_first_elements_first.py | 23 ---- .../test_update_scripture_block_updater.py | 119 ------------------ 4 files changed, 6 insertions(+), 143 deletions(-) rename machine/corpora/{scripture_update_block_handler_base.py => scripture_update_block_handler.py} (80%) delete mode 100644 machine/corpora/scripture_update_block_handler_first_elements_first.py delete mode 100644 tests/corpora/test_update_scripture_block_updater.py diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py index 00787cf2..afb9e75a 100644 --- a/machine/corpora/scripture_update_block.py +++ b/machine/corpora/scripture_update_block.py @@ -11,6 +11,10 @@ def __init__(self) -> None: self._ref: ScriptureRef = ScriptureRef() self._elements: 
list[ScriptureUpdateElement] = [] + @property + def elements(self) -> list[ScriptureUpdateElement]: + return self._elements + def add_existing_text(self, token: UsfmToken, marked_for_removal: bool = False) -> None: self._elements.append( ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) diff --git a/machine/corpora/scripture_update_block_handler_base.py b/machine/corpora/scripture_update_block_handler.py similarity index 80% rename from machine/corpora/scripture_update_block_handler_base.py rename to machine/corpora/scripture_update_block_handler.py index 2998a0d9..ff1d6f9e 100644 --- a/machine/corpora/scripture_update_block_handler_base.py +++ b/machine/corpora/scripture_update_block_handler.py @@ -1,9 +1,10 @@ from __future__ import annotations +from abc import ABC from .scripture_update_block import ScriptureUpdateBlock -class ScriptureUpdateBlockHandlerBase: +class ScriptureUpdateBlockHandler(ABC): def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: raise NotImplementedError("Must be implemented in subclass") diff --git a/machine/corpora/scripture_update_block_handler_first_elements_first.py b/machine/corpora/scripture_update_block_handler_first_elements_first.py deleted file mode 100644 index 17f44798..00000000 --- a/machine/corpora/scripture_update_block_handler_first_elements_first.py +++ /dev/null @@ -1,23 +0,0 @@ -from __future__ import annotations - -from .scripture_update_block import ScriptureUpdateBlock -from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase -from .scripture_update_element import ScriptureUpdateElementType - - -class ScriptureUpdateBlockHandlerFirstElementsFirst(ScriptureUpdateBlockHandlerBase): - - def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - # If a paragraph, embed or style element occurs before existing text, move it before inserted text as well. 
- current_insert_index = 0 - for current_index in range(len(block._elements)): - element = block._elements[current_index] - if element.type == ScriptureUpdateElementType.EXISTING_TEXT: - # we found existing text, so we stop looking for elements to move - break - if current_index != current_insert_index and element.type != ScriptureUpdateElementType.INSERTED_TEXT: - block._elements.remove(element) - block._elements.insert(current_insert_index, element) - current_insert_index += 1 - - return block diff --git a/tests/corpora/test_update_scripture_block_updater.py b/tests/corpora/test_update_scripture_block_updater.py deleted file mode 100644 index 32d9057a..00000000 --- a/tests/corpora/test_update_scripture_block_updater.py +++ /dev/null @@ -1,119 +0,0 @@ -from typing import List, Optional, Sequence, Tuple - -from machine.corpora.scripture_update_block_handler_first_elements_first import ( - ScriptureUpdateBlockHandlerFirstElementsFirst, -) - -from machine.corpora.scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase -from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH - -from machine.corpora import ( - FileParatextProjectTextUpdater, - ScriptureRef, - UpdateUsfmMarkerBehavior, - UpdateUsfmParserHandler, - UpdateUsfmTextBehavior, - parse_usfm, -) - - -def test_preserve_paragraphs(): - rows = [ - (scr_ref("MAT 1:1"), str("U1")), - ( - scr_ref("MAT 1:1/1:f"), - str("UF1"), - ), - (scr_ref("MAT 1:2"), str("U2")), - ( - scr_ref("MAT 1:2/1:f"), - str("UF2"), - ), - (scr_ref("MAT 1:3"), str("U3")), - ( - scr_ref("MAT 1:3/1:f"), - str("UF3"), - ), - ] - usfm = r"""\id MAT -\c 1 -\v 1 \f \ft \fm ' \fm* hello world \f* it comes first -\v 2 it comes \f \ft hello \fm ' \fm* world \f* middling -\v 3 it comes last \f \ft hello world \fm ' \fm* \f* -""" - - target = update_usfm(rows, usfm) - result = r"""\id MAT -\c 1 -\v 1 U1 \f \ft UF1 \fm ' \fm*\f* -\v 2 U2 \f \ft UF2 \fm ' \fm*\f* -\v 3 U3 \f \ft UF3 \fm ' \fm*\f* -""" - - assess(target, 
result) - - target_first_element = update_usfm( - rows, usfm, update_block_handlers=[ScriptureUpdateBlockHandlerFirstElementsFirst()] - ) - result_first_element = r"""\id MAT -\c 1 -\v 1 \f \ft \fm ' \fm* UF1 \f* U1 -\v 2 U2 \f \ft UF2 \fm ' \fm*\f* -\v 3 U3 \f \ft UF3 \fm ' \fm*\f* -""" - assess(target_first_element, result_first_element) - - -def scr_ref(*refs: str) -> List[ScriptureRef]: - return [ScriptureRef.parse(ref) for ref in refs] - - -def update_usfm( - rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, - source: Optional[str] = None, - id_text: Optional[str] = None, - text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, - paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, - embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, - style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, - preserve_paragraph_styles: Optional[Sequence[str]] = None, - update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None, -) -> Optional[str]: - if source is None: - updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH) - return updater.update_usfm( - "MAT", - rows, - id_text, - text_behavior, - paragraph_behavior, - embed_behavior, - style_behavior, - preserve_paragraph_styles, - update_block_handlers, - ) - else: - source = source.strip().replace("\r\n", "\n") + "\r\n" - updater = UpdateUsfmParserHandler( - rows, - id_text, - text_behavior, - paragraph_behavior, - embed_behavior, - style_behavior, - preserve_paragraph_styles, - update_block_handlers, - ) - parse_usfm(source, updater) - return updater.get_usfm() - - -def assess(target: Optional[str], truth: str) -> None: - assert target is not None - for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): - assert target_line.strip() == truth_line.strip() - - -def read_usfm() -> str: - with (USFM_TEST_PROJECT_PATH / "41MATTes.SFM").open("r", 
encoding="utf-8-sig", newline="\r\n") as file: - return file.read() From d7d804f83bf7e86d294064cd236d061f0ac00d0c Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 11 Apr 2025 16:21:50 -0400 Subject: [PATCH 07/31] Respond to reviewer comments Pass marker type (embed, style) to update block --- machine/corpora/scripture_embed.py | 16 ++++++++ .../scripture_ref_usfm_parser_handler.py | 41 ++++++++++++------- machine/corpora/scripture_update_block.py | 18 ++++---- .../corpora/scripture_update_block_handler.py | 3 +- machine/corpora/scripture_update_element.py | 21 +++++++++- machine/corpora/update_usfm_parser_handler.py | 10 +++-- 6 files changed, 80 insertions(+), 29 deletions(-) create mode 100644 machine/corpora/scripture_embed.py diff --git a/machine/corpora/scripture_embed.py b/machine/corpora/scripture_embed.py new file mode 100644 index 00000000..cc4a64f6 --- /dev/null +++ b/machine/corpora/scripture_embed.py @@ -0,0 +1,16 @@ +from typing import Optional + +EMBED_PART_START_CHAR_STYLES = ("f", "x", "z") +EMBED_STYLES = ("f", "fe", "fig", "fm", "x") + + +def is_note_text(marker: Optional[str]) -> bool: + return marker == "ft" + + +def is_embed_part_style(marker: Optional[str]) -> bool: + return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES) + + +def is_embed_style(marker: Optional[str]) -> bool: + return marker is not None and marker.strip("*") in EMBED_STYLES diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py index db9081b7..febc4922 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler.py @@ -5,6 +5,7 @@ from ..scripture.verse_ref import VerseRef, are_overlapping_verse_ranges from .corpora_utils import merge_verse_ranges from .scripture_element import ScriptureElement +from .scripture_embed import EMBED_PART_START_CHAR_STYLES, is_embed_part_style, is_embed_style, is_note_text from .scripture_ref 
import ScriptureRef from .usfm_parser_handler import UsfmParserHandler from .usfm_parser_state import UsfmParserState @@ -21,10 +22,6 @@ class ScriptureTextType(Enum): _EMBED_STYLES = {"f", "fe", "x", "fig"} -def _is_embed_style(marker: Optional[str]) -> bool: - return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z")) - - class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC): def __init__(self) -> None: self._cur_verse_ref: VerseRef = VerseRef() @@ -122,23 +119,29 @@ def opt_break(self, state: UsfmParserState) -> None: def start_char( self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]] ) -> None: + if is_embed_part_style(marker) and self._in_note_text: + self._in_nested_embed = True # if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment self._check_convert_verse_para_to_non_verse(state) - if _is_embed_style(marker): - self._start_embed_text_wrapper(state, marker) + if is_embed_style(marker): + self._in_embed = True + self._start_embed_wrapper(state, marker) + + if is_note_text(marker): + self._start_note_text_wrapper(state) def end_char( self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool ) -> None: - if _is_embed_style(marker): - self._end_embed_text_wrapper(state) - - def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None: - self._start_embed_text_wrapper(state, marker) - - def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: - self._end_embed_text_wrapper(state) + if is_embed_part_style(marker): + if self._in_nested_embed: + self._in_nested_embed = False + else: + self._end_note_text_wrapper(state) + if is_embed_style(marker): + self._end_embed(state, marker, attributes, closed) + self._in_embed = False def _start_verse_text(self, state: UsfmParserState, scripture_refs: 
Optional[Sequence[ScriptureRef]]) -> None: ... @@ -209,7 +212,7 @@ def _end_parent_element(self) -> None: self._cur_elements_stack.pop() def _end_embed_elements(self) -> None: - if self._cur_elements_stack and _is_embed_style(self._cur_elements_stack[-1].name): + if self._cur_elements_stack and is_embed_style(self._cur_elements_stack[-1].name): self._cur_elements_stack.pop() def _create_verse_refs(self) -> List[ScriptureRef]: @@ -238,3 +241,11 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None ): self._start_parent_element(para_tag.marker) self._start_non_verse_text_wrapper(state) + + def _is_in_embed(self, marker: Optional[str]) -> bool: + return self._in_embed or is_embed_style(marker) + + def _is_in_nested_embed(self, marker: Optional[str]) -> bool: + return self._in_nested_embed or ( + marker is not None and marker.startswith("+") and marker[1] in EMBED_PART_START_CHAR_STYLES + ) diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py index afb9e75a..b4c7e290 100644 --- a/machine/corpora/scripture_update_block.py +++ b/machine/corpora/scripture_update_block.py @@ -1,14 +1,18 @@ from __future__ import annotations from .scripture_ref import ScriptureRef -from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType +from .scripture_update_element import ( + ScriptureUpdateElement, + ScriptureUpdateElementType, + create_non_text_scripture_element, +) from .usfm_token import UsfmToken, UsfmTokenType class ScriptureUpdateBlock: def __init__(self) -> None: - self._ref: ScriptureRef = ScriptureRef() + self.ref: ScriptureRef = ScriptureRef() self._elements: list[ScriptureUpdateElement] = [] @property @@ -29,21 +33,19 @@ def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None: ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) ) else: - 
self._elements.append(ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, [token], marked_for_removal)) + self._elements.append(create_non_text_scripture_element([token], marked_for_removal)) def add_tokens(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None: if len(tokens) == 0: return - self._elements.append( - ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, tokens.copy(), marked_for_removal) - ) + self._elements.append(create_non_text_scripture_element(tokens, marked_for_removal)) def update_ref(self, ref: ScriptureRef) -> None: - self._ref = ref + self.ref = ref def clear(self) -> None: self._elements.clear() - self._ref = ScriptureRef() + self.ref = ScriptureRef() def get_tokens(self) -> list[UsfmToken]: return [token for element in self._elements for token in element.get_tokens()] diff --git a/machine/corpora/scripture_update_block_handler.py b/machine/corpora/scripture_update_block_handler.py index ff1d6f9e..e5dc9cca 100644 --- a/machine/corpora/scripture_update_block_handler.py +++ b/machine/corpora/scripture_update_block_handler.py @@ -6,5 +6,4 @@ class ScriptureUpdateBlockHandler(ABC): - def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - raise NotImplementedError("Must be implemented in subclass") + def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: ... 
diff --git a/machine/corpora/scripture_update_element.py b/machine/corpora/scripture_update_element.py index fe39d7e5..7296bd0a 100644 --- a/machine/corpora/scripture_update_element.py +++ b/machine/corpora/scripture_update_element.py @@ -3,12 +3,16 @@ from dataclasses import dataclass from enum import Enum, auto -from .usfm_token import UsfmToken +from .scripture_embed import is_embed_style +from .usfm_token import UsfmToken, UsfmTokenType class ScriptureUpdateElementType(Enum): EXISTING_TEXT = auto() INSERTED_TEXT = auto() + PARAGRAPH = auto() + EMBED = auto() + STYLE = auto() OTHER = auto() @@ -22,3 +26,18 @@ def get_tokens(self) -> list[UsfmToken]: if self.marked_for_removal: return [] return self.tokens + + +def create_non_text_scripture_element( + tokens: list[UsfmToken], marked_for_removal: bool = False +) -> ScriptureUpdateElement: + tokens = tokens.copy() + # Determine if it is a Paragraph, style, embed or other + if len(tokens) == 0 or tokens[0].marker is None: + return ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, [], marked_for_removal) + if tokens[0].type == UsfmTokenType.PARAGRAPH: + return ScriptureUpdateElement(ScriptureUpdateElementType.PARAGRAPH, tokens, marked_for_removal) + if is_embed_style(tokens[0].marker): + return ScriptureUpdateElement(ScriptureUpdateElementType.EMBED, tokens, marked_for_removal) + else: + return ScriptureUpdateElement(ScriptureUpdateElementType.STYLE, tokens, marked_for_removal) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index df3c90e7..203e8971 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -2,6 +2,7 @@ from typing import Iterable, List, Optional, Sequence, Tuple, Union from ..scripture.verse_ref import VerseRef +from .scripture_embed import is_embed_part_style from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, 
ScriptureTextType from .usfm_parser_state import UsfmParserState @@ -354,9 +355,12 @@ def _skip_updatable_tokens(self, state: UsfmParserState) -> None: self._token_index += 1 self._token_index = state.index + 1 + state.special_token_count - def _replace_with_new_tokens(self, state: UsfmParserState) -> bool: - if self._current_text_type == ScriptureTextType.EMBED: - return False + def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) -> bool: + marker: Optional[str] = state.token if state.token is None else state.token.marker + in_embed: bool = self._is_in_embed(marker) + + in_nested_embed: bool = self._is_in_nested_embed(marker) + is_style_tag: bool = marker is not None and not is_embed_part_style(marker) existing_text = any( t.type == UsfmTokenType.TEXT and t.text From 390baa02ec541c852df1bc7e848e00e0ed6a93f6 Mon Sep 17 00:00:00 2001 From: Ben King Date: Mon, 14 Apr 2025 13:09:10 -0400 Subject: [PATCH 08/31] Additional denormalization tests (not all passing) --- .../analysis/quotation_mark_resolver.py | 95 +++++++++++-------- machine/corpora/analysis/text_segment.py | 4 +- ...lization_scripture_update_block_handler.py | 38 ++++++-- .../analysis/test_quotation_mark_resolver.py | 19 ++++ ...lization_scripture_block_update_handler.py | 36 +++++++ 5 files changed, 142 insertions(+), 50 deletions(-) create mode 100644 tests/corpora/analysis/test_quotation_mark_resolver.py diff --git a/machine/corpora/analysis/quotation_mark_resolver.py b/machine/corpora/analysis/quotation_mark_resolver.py index 07760b16..5d98e5c6 100644 --- a/machine/corpora/analysis/quotation_mark_resolver.py +++ b/machine/corpora/analysis/quotation_mark_resolver.py @@ -15,6 +15,9 @@ def __init__(self): self.quotation_stack: list[QuotationMarkMetadata] = [] self.current_depth: int = 0 + def get_current_depth(self) -> int: + return self.current_depth + 1 + def has_open_quotation_mark(self) -> bool: return self.current_depth > 0 @@ -44,6 +47,10 @@ def 
get_deepest_opening_quotation_mark(self) -> str: class QuotationContinuerState: def __init__(self): self.quotation_continuer_stack: list[QuotationMarkMetadata] = [] + self.current_depth = 0 + + def get_current_depth(self) -> int: + return self.current_depth def has_continuer_been_observed(self) -> bool: return len(self.quotation_continuer_stack) > 0 @@ -53,22 +60,24 @@ def add_quotation_continuer( ) -> QuotationMarkMetadata: quote = quote_match.resolve(len(self.quotation_continuer_stack) + 1, QuotationMarkDirection.Opening) self.quotation_continuer_stack.append(quote) + self.current_depth += 1 if len(self.quotation_continuer_stack) == len(quotation_mark_resolver_state.quotation_stack): self.quotation_continuer_stack.clear() + self.current_depth = 0 return quote class QuotationMarkResolver: - quote_pattern = regex.compile(r"(?<=(.)|^)(\p{Quotation_Mark}|<<|>>|<|>)(?=(.)|$)", regex.U) apostrophe_pattern = regex.compile(r"[\'\u2019\u2018]", regex.U) - whitespace_pattern = regex.compile(r"^[\s~]*$", regex.U) - latin_letter_pattern = regex.compile(r"^\p{script=Latin}$", regex.U) - punctuation_pattern = regex.compile(r"^[\.,;\?!\)\]\-—۔،؛]$", regex.U) def __init__(self, quote_convention_set: QuoteConventionSet): - self.quote_convention_set = quote_convention_set - self.quotation_mark_resolver_state = QuotationMarkResolverState() - self.quotation_continuer_state = QuotationContinuerState() + self._quote_convention_set = quote_convention_set + self._quotation_mark_resolver_state = QuotationMarkResolverState() + self._quotation_continuer_state = QuotationContinuerState() + + def reset(self) -> None: + self._quotation_mark_resolver_state = QuotationMarkResolverState() + self._quotation_continuer_state = QuotationContinuerState() def resolve_quotation_marks( self, quote_matches: list[QuotationMarkStringMatch] @@ -97,7 +106,7 @@ def _resolve_quotation_mark( elif self._is_apostrophe(quote_match, previous_mark, next_mark): pass elif self._is_closing_quote(quote_match, 
previous_mark, next_mark): - if not self.quotation_mark_resolver_state.has_open_quotation_mark(): + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): return quote = self._process_closing_mark(quote_match) yield quote @@ -116,40 +125,45 @@ def _is_quotation_continuer( ) -> bool: if not quote_match.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker): return False - if not self.quotation_mark_resolver_state.has_open_quotation_mark(): + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): return False - if not self.quotation_continuer_state.has_continuer_been_observed(): + if not self._quotation_continuer_state.has_continuer_been_observed(): if quote_match.start_index > 0: return False if ( quote_match.get_quotation_mark() - != self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark() + != self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() ): return False - if self.quotation_mark_resolver_state.are_more_than_n_quotes_open(1): + if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): if next_match is None or next_match.get_start_index() != quote_match.get_end_index(): return False + elif ( + self._quotation_continuer_state.get_current_depth() + >= self._quotation_mark_resolver_state.get_current_depth() + ): + return False else: if ( quote_match.get_quotation_mark() - != self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark() + != self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() ): return False return True def _process_quotation_continuer(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - return self.quotation_continuer_state.add_quotation_continuer(quote_match, self.quotation_mark_resolver_state) + return self._quotation_continuer_state.add_quotation_continuer(quote_match, self._quotation_mark_resolver_state) def _is_depth_too_great(self) -> bool: - return 
self.quotation_mark_resolver_state.are_more_than_n_quotes_open(4) + return self._quotation_mark_resolver_state.are_more_than_n_quotes_open(4) def _process_opening_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - return self.quotation_mark_resolver_state.add_opening_quotation_mark(quote_match) + return self._quotation_mark_resolver_state.add_opening_quotation_mark(quote_match) def _process_closing_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - return self.quotation_mark_resolver_state.add_closing_quotation_mark(quote_match) + return self._quotation_mark_resolver_state.add_closing_quotation_mark(quote_match) def _is_opening_quote( self, @@ -158,11 +172,11 @@ def _is_opening_quote( next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if not match.is_valid_opening_quotation_mark(self.quote_convention_set): + if not match.is_valid_opening_quotation_mark(self._quote_convention_set): return False # if the quote convention is ambiguous, use whitespace as a clue - if match.is_valid_closing_quotation_mark(self.quote_convention_set): + if match.is_valid_closing_quotation_mark(self._quote_convention_set): return ( match.has_leading_whitespace() or self._does_most_recent_opening_mark_immediately_precede(match) @@ -177,15 +191,15 @@ def _is_closing_quote( next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if not match.is_valid_closing_quotation_mark(self.quote_convention_set): + if not match.is_valid_closing_quotation_mark(self._quote_convention_set): return False # if the quote convention is ambiguous, use whitespace as a clue - if self.quote_convention_set.is_valid_opening_quotation_mark(match.get_quotation_mark()): + if self._quote_convention_set.is_valid_opening_quotation_mark(match.get_quotation_mark()): return ( match.has_trailing_whitespace() or match.has_trailing_punctuation() - or match.has_trailing_closing_quotation_mark(self.quote_convention_set) + or 
match.has_trailing_closing_quotation_mark(self._quote_convention_set) ) and not match.has_leading_whitespace() return True @@ -195,7 +209,7 @@ def _is_malformed_opening_quote( previous_match: Union[QuotationMarkStringMatch, None], next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if not self.quote_convention_set.is_valid_opening_quotation_mark(match.get_quotation_mark()): + if not self._quote_convention_set.is_valid_opening_quotation_mark(match.get_quotation_mark()): return False if match.has_leading_quote_introducer(): @@ -204,7 +218,7 @@ def _is_malformed_opening_quote( if ( match.has_leading_whitespace() and match.has_trailing_whitespace() - and not self.quotation_mark_resolver_state.has_open_quotation_mark() + and not self._quotation_mark_resolver_state.has_open_quotation_mark() ): return True @@ -216,7 +230,7 @@ def _is_malformed_closing_quote( previous_match: Union[QuotationMarkStringMatch, None], next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if not self.quote_convention_set.is_valid_closing_quotation_mark(match.get_quotation_mark()): + if not self._quote_convention_set.is_valid_closing_quotation_mark(match.get_quotation_mark()): return False return ( @@ -224,17 +238,19 @@ def _is_malformed_closing_quote( not match.has_trailing_whitespace() or (match.has_leading_whitespace() and match.has_trailing_whitespace()) ) - and self.quotation_mark_resolver_state.has_open_quotation_mark() - and self.quote_convention_set.are_marks_a_valid_pair( - self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + and self._quotation_mark_resolver_state.has_open_quotation_mark() + and self._quote_convention_set.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() ) ) def _does_most_recent_opening_mark_immediately_precede(self, match: QuotationMarkStringMatch) -> bool: - if not 
self.quotation_mark_resolver_state.has_open_quotation_mark(): + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): return False - return self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.get_previous_character() + return ( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.get_previous_character() + ) def _is_apostrophe( self, @@ -259,24 +275,25 @@ def _is_apostrophe( match.has_trailing_whitespace() or match.has_trailing_punctuation() ): # check whether it could be a closing quote - if not self.quotation_mark_resolver_state.has_open_quotation_mark(): + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): return True - if not self.quote_convention_set.are_marks_a_valid_pair( - self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + if not self._quote_convention_set.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() ): return True - if next_match is not None and self.quote_convention_set.are_marks_a_valid_pair( - self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), next_match.get_quotation_mark() + if next_match is not None and self._quote_convention_set.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), + next_match.get_quotation_mark(), ): return True # for languages that use apostrophes at the start and end of words if ( - not self.quotation_mark_resolver_state.has_open_quotation_mark() + not self._quotation_mark_resolver_state.has_open_quotation_mark() and match.get_quotation_mark() == "'" - or self.quotation_mark_resolver_state.has_open_quotation_mark() - and not self.quote_convention_set.are_marks_a_valid_pair( - self.quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + or 
self._quotation_mark_resolver_state.has_open_quotation_mark() + and not self._quote_convention_set.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() ) ): return True diff --git a/machine/corpora/analysis/text_segment.py b/machine/corpora/analysis/text_segment.py index 08080341..ec94b08a 100644 --- a/machine/corpora/analysis/text_segment.py +++ b/machine/corpora/analysis/text_segment.py @@ -33,10 +33,10 @@ def get_immediate_preceding_marker_type(self) -> UsfmMarkerType: def is_marker_in_preceding_context(self, marker: UsfmMarkerType) -> bool: return marker in self.markers_in_preceding_context - def get_previous_segment(self) -> "TextSegment | None": + def get_previous_segment(self) -> Union["TextSegment", None]: return self.previous_segment - def get_next_segment(self) -> "TextSegment | None": + def get_next_segment(self) -> Union["TextSegment", None]: return self.next_segment def is_first_segment_in_verse(self) -> bool: diff --git a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py b/machine/corpora/quotation_denormalization_scripture_update_block_handler.py index 163310b2..78451b9c 100644 --- a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py +++ b/machine/corpora/quotation_denormalization_scripture_update_block_handler.py @@ -21,33 +21,53 @@ def __init__(self, target_quote_convention: QuoteConvention): self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( QuoteConventionSet([self._normalized_quote_convention]) ) - self._quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( + self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder() + + # Each embed represents a separate context for quotation marks + # (i.e. you can't open a quote in one and close it in another) + # so we need to keep track of the verse and embed contexts separately. 
+ self._verse_text_quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( + QuoteConventionSet([self._normalized_quote_convention]) + ) + self._embed_quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( QuoteConventionSet([self._normalized_quote_convention]) ) - self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder() def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - for element_index, element in enumerate(block._elements): - self._process_scripture_element(element) + # print(",".join([p.name for p in block._ref.path])) + if block._ref.is_verse: + return self._process_verse_text_block(block) + else: + return self._process_embed_block(block) + + def _process_verse_text_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + for element in block._elements: + self._process_scripture_element(element, self._verse_text_quotation_mark_resolver) + return block + + def _process_embed_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + self._embed_quotation_mark_resolver.reset() + for element in block._elements: + self._process_scripture_element(element, self._embed_quotation_mark_resolver) return block - def _process_scripture_element(self, element: ScriptureUpdateElement) -> None: + def _process_scripture_element( + self, element: ScriptureUpdateElement, quotation_mark_resolver: QuotationMarkResolver + ) -> None: text_segments: List[TextSegment] = self._create_text_segments(element) quotation_mark_matches: List[QuotationMarkStringMatch] = ( self._quotation_mark_finder.find_all_potential_quotation_marks_in_text_segments(text_segments) ) for match in quotation_mark_matches: print(match.get_context()) - for resolved_quotation_mark in self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches): + for resolved_quotation_mark in quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches): 
resolved_quotation_mark.update_quotation_mark(self._target_quote_convention) def _create_text_segments(self, element: ScriptureUpdateElement) -> List[TextSegment]: text_segments: List[TextSegment] = [] for token in element.get_tokens(): if token.type == UsfmTokenType.CHAPTER: - self._quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( - QuoteConventionSet([self._normalized_quote_convention]) - ) + self._verse_text_quotation_mark_resolver.reset() self._next_scripture_text_segment_builder = TextSegment.Builder() self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) elif token.type == UsfmTokenType.VERSE: diff --git a/tests/corpora/analysis/test_quotation_mark_resolver.py b/tests/corpora/analysis/test_quotation_mark_resolver.py new file mode 100644 index 00000000..73784f8a --- /dev/null +++ b/tests/corpora/analysis/test_quotation_mark_resolver.py @@ -0,0 +1,19 @@ +from machine.corpora.analysis import QuotationMarkResolver, standard_quote_conventions + + +def test_reset() -> None: + quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( + standard_quote_conventions.standard_quote_conventions + ) + + assert quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack == [] + assert quotation_mark_resolver._quotation_continuer_state.quotation_continuer_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + assert quotation_mark_resolver._quotation_continuer_state.current_depth == 0 + + quotation_mark_resolver.reset() + + assert quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack == [] + assert quotation_mark_resolver._quotation_continuer_state.quotation_continuer_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + assert quotation_mark_resolver._quotation_continuer_state.current_depth == 0 diff --git a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py 
b/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py index 0c2e2006..a952c515 100644 --- a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py +++ b/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py @@ -274,6 +274,42 @@ def test_simple_arabic_quote_denormalization() -> None: assert_usfm_equal(observed_usfm, expected_usfm) +def test_quotes_spanning_verses() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + \\v 2 a'You shall not eat of any tree of the garden'?" + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, \n" + + "\\v 2 ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_single_embed() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + \\f + \\ft "This is a 'footnote'" \\f* + of the field which Yahweh God had made. + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field which Yahweh God had made." 
+ ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + def denormalize_quotation_marks(normalized_usfm: str, quote_convention_name: str) -> str: standard_english_quote_convention = ( standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(quote_convention_name) From 3fe808b0b2278677f0c17bed6086f47bf2d0ad96 Mon Sep 17 00:00:00 2001 From: Ben King Date: Tue, 15 Apr 2025 09:16:56 -0400 Subject: [PATCH 09/31] Rebase + additional denormalization tests --- .../analysis/quotation_mark_string_match.py | 6 +- ...lization_scripture_update_block_handler.py | 37 ++++++---- ...lization_scripture_block_update_handler.py | 71 ++++++++++++++++++- 3 files changed, 93 insertions(+), 21 deletions(-) diff --git a/machine/corpora/analysis/quotation_mark_string_match.py b/machine/corpora/analysis/quotation_mark_string_match.py index ee0e5d08..55cd440a 100644 --- a/machine/corpora/analysis/quotation_mark_string_match.py +++ b/machine/corpora/analysis/quotation_mark_string_match.py @@ -100,9 +100,9 @@ def is_at_end_of_segment(self) -> bool: def has_leading_whitespace(self) -> bool: if self.get_previous_character() is None: return ( - self.get_text_segment().get_immediate_preceding_marker_type() == UsfmMarkerType.ParagraphMarker - or self.get_text_segment().get_immediate_preceding_marker_type() == UsfmMarkerType.EmbedMarker - or self.get_text_segment().get_immediate_preceding_marker_type() == UsfmMarkerType.VerseMarker + self.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker) + or self.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.EmbedMarker) + or self.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) ) return self.does_previous_character_match(self.whitespace_pattern) diff --git a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py 
b/machine/corpora/quotation_denormalization_scripture_update_block_handler.py index 78451b9c..3c011b5b 100644 --- a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py +++ b/machine/corpora/quotation_denormalization_scripture_update_block_handler.py @@ -8,16 +8,19 @@ from .analysis.text_segment import TextSegment from .analysis.usfm_marker_type import UsfmMarkerType from .scripture_update_block import ScriptureUpdateBlock -from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase -from .scripture_update_element import ScriptureUpdateElement +from .scripture_update_block_handler import ScriptureUpdateBlockHandler +from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType from .usfm_token import UsfmTokenType -class QuotationDenormalizationScriptureUpdateBlockHandler(ScriptureUpdateBlockHandlerBase): +class QuotationDenormalizationScriptureUpdateBlockHandler(ScriptureUpdateBlockHandler): - def __init__(self, target_quote_convention: QuoteConvention): + def __init__(self, target_quote_convention: QuoteConvention, should_run_on_existing_text: bool = False): + super().__init__() self._target_quote_convention: QuoteConvention = target_quote_convention self._normalized_quote_convention: QuoteConvention = target_quote_convention.normalize() + self._should_run_on_existing_text: bool = should_run_on_existing_text + self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( QuoteConventionSet([self._normalized_quote_convention]) ) @@ -34,23 +37,30 @@ def __init__(self, target_quote_convention: QuoteConvention): ) def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - # print(",".join([p.name for p in block._ref.path])) - if block._ref.is_verse: - return self._process_verse_text_block(block) - else: + if len(block.elements) > 0 and block.elements[0].type == ScriptureUpdateElementType.EMBED: return self._process_embed_block(block) - def 
_process_verse_text_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - for element in block._elements: - self._process_scripture_element(element, self._verse_text_quotation_mark_resolver) - return block + return self._process_verse_text_block(block) def _process_embed_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: self._embed_quotation_mark_resolver.reset() for element in block._elements: + if element.type == ScriptureUpdateElementType.EXISTING_TEXT and not self._should_run_on_existing_text: + continue + self._process_scripture_element(element, self._embed_quotation_mark_resolver) return block + def _process_verse_text_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + for element in block._elements: + if element.type == ScriptureUpdateElementType.EMBED_BLOCK: + continue + if element.type == ScriptureUpdateElementType.EXISTING_TEXT and not self._should_run_on_existing_text: + continue + + self._process_scripture_element(element, self._verse_text_quotation_mark_resolver) + return block + def _process_scripture_element( self, element: ScriptureUpdateElement, quotation_mark_resolver: QuotationMarkResolver ) -> None: @@ -58,8 +68,6 @@ def _process_scripture_element( quotation_mark_matches: List[QuotationMarkStringMatch] = ( self._quotation_mark_finder.find_all_potential_quotation_marks_in_text_segments(text_segments) ) - for match in quotation_mark_matches: - print(match.get_context()) for resolved_quotation_mark in quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches): resolved_quotation_mark.update_quotation_mark(self._target_quote_convention) @@ -78,7 +86,6 @@ def _create_text_segments(self, element: ScriptureUpdateElement) -> List[TextSeg self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) elif token.type == UsfmTokenType.NOTE: self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) - # TODO: create a text segment for 
the embed elif token.type == UsfmTokenType.TEXT: self._next_scripture_text_segment_builder.set_usfm_token(token) if token.text is not None: diff --git a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py b/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py index a952c515..5197773f 100644 --- a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py +++ b/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py @@ -279,7 +279,7 @@ def test_quotes_spanning_verses() -> None: \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, "Has God really said, - \\v 2 a'You shall not eat of any tree of the garden'?" + \\v 2 'You shall not eat of any tree of the garden'?" """ expected_usfm = ( @@ -310,15 +310,80 @@ def test_single_embed() -> None: assert_usfm_equal(observed_usfm, expected_usfm) -def denormalize_quotation_marks(normalized_usfm: str, quote_convention_name: str) -> str: +def test_multiple_embeds() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + \\f + \\ft "This is a 'footnote'" \\f* + of the field \\f + \\ft Second "footnote" here \\f* which Yahweh God had made. + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field \\f + \\ft Second " + + "“footnote” here \\f* which Yahweh God had made." + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_quotes_in_text_and_embed() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really \\f + \\ft a + "footnote" in the "midst of 'text'" \\f* said, + 'You shall not eat of any tree of the garden'?" 
+ """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_quotes_in_multiple_verses_and_embed() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God + \\v 2 really \\f + \\ft a + "footnote" in the "midst of 'text'" \\f* said, + 'You shall not eat of any tree of the garden'?" + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God\n" + + "\\v 2 really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def denormalize_quotation_marks( + normalized_usfm: str, quote_convention_name: str, should_run_on_existing_text=True +) -> str: standard_english_quote_convention = ( standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(quote_convention_name) ) assert standard_english_quote_convention is not None quotation_denormalizer: QuotationDenormalizationScriptureUpdateBlockHandler = ( - QuotationDenormalizationScriptureUpdateBlockHandler(standard_english_quote_convention) + QuotationDenormalizationScriptureUpdateBlockHandler( + standard_english_quote_convention, should_run_on_existing_text=should_run_on_existing_text + ) ) + updater = UpdateUsfmParserHandler(update_block_handlers=[quotation_denormalizer]) 
parse_usfm(normalized_usfm, updater) From dd01cbc6918cf68a480f74395029eef8aabc7514 Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 25 Apr 2025 15:25:33 -0400 Subject: [PATCH 10/31] Improved handling for NLLB-produced quote errors --- machine/corpora/__init__.py | 8 + machine/corpora/analysis/__init__.py | 10 +- .../depth_based_quotation_mark_resolver.py | 341 +++++++++++++++ .../preliminary_quotation_analyzer.py | 14 +- .../analysis/quotation_mark_direction.py | 6 +- .../quotation_mark_resolution_issue.py | 9 + .../quotation_mark_resolution_settings.py | 24 + .../analysis/quotation_mark_resolver.py | 302 +------------ .../analysis/quotation_mark_string_match.py | 25 +- machine/corpora/analysis/quote_convention.py | 13 +- ...onvention_detection_resolution_settings.py | 35 ++ .../analysis/quote_convention_detector.py | 13 +- .../corpora/analysis/quote_convention_set.py | 15 + machine/corpora/analysis/usfm_marker_type.py | 16 +- .../corpora/basic_quotation_mark_resolver.py | 119 +++++ .../quotation_denormalization_action.py | 7 + .../quotation_denormalization_first_pass.py | 82 ++++ ...ion_denormalization_resolution_settings.py | 37 ++ ...lization_scripture_update_block_handler.py | 94 +++- .../quotation_denormalization_settings.py | 40 ++ .../analysis/test_quotation_mark_resolver.py | 40 +- ...st_quotation_denormalization_first_pass.py | 205 +++++++++ ...lization_scripture_block_update_handler.py | 413 ++++++++++++++++-- 23 files changed, 1487 insertions(+), 381 deletions(-) create mode 100644 machine/corpora/analysis/depth_based_quotation_mark_resolver.py create mode 100644 machine/corpora/analysis/quotation_mark_resolution_issue.py create mode 100644 machine/corpora/analysis/quotation_mark_resolution_settings.py create mode 100644 machine/corpora/analysis/quote_convention_detection_resolution_settings.py create mode 100644 machine/corpora/basic_quotation_mark_resolver.py create mode 100644 machine/corpora/quotation_denormalization_action.py create mode 100644 
machine/corpora/quotation_denormalization_first_pass.py create mode 100644 machine/corpora/quotation_denormalization_resolution_settings.py create mode 100644 machine/corpora/quotation_denormalization_settings.py create mode 100644 tests/corpora/test_quotation_denormalization_first_pass.py diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index bf5f611c..23e95124 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -24,9 +24,13 @@ from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler +from .quotation_denormalization_action import QuotationDenormalizationAction +from .quotation_denormalization_first_pass import QuotationDenormalizationFirstPass +from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings from .quotation_denormalization_scripture_update_block_handler import ( QuotationDenormalizationScriptureUpdateBlockHandler, ) +from .quotation_denormalization_settings import QuotationDenormalizationSettings from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -124,7 +128,11 @@ "PlaceMarkersAlignmentInfo", "PlaceMarkersUsfmUpdateBlockHandler", "parse_usfm", + "QuotationDenormalizationAction", + "QuotationDenormalizationFirstPass", "QuotationDenormalizationScriptureUpdateBlockHandler", + "QuotationDenormalizationResolutionSettings", + "QuotationDenormalizationSettings", "RtlReferenceOrder", "ScriptureElement", "ScriptureRef", diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/analysis/__init__.py index 411a5643..8bb23a5b 100644 --- a/machine/corpora/analysis/__init__.py +++ b/machine/corpora/analysis/__init__.py @@ -1,16 
+1,24 @@ +from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings from .quotation_mark_resolver import QuotationMarkResolver from .quotation_mark_string_match import QuotationMarkStringMatch from .quote_convention import QuoteConvention +from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector from .quote_convention_set import QuoteConventionSet from .text_segment import TextSegment from .usfm_marker_type import UsfmMarkerType __all__ = [ - "QuotationMarkResolver", + "DepthBasedQuotationMarkResolver", "QuotationMarkStringMatch", "QuoteConvention", "QuoteConventionAnalysis", + "QuoteConventionDetectionResolutionSettings", + "QuotationMarkResolutionIssue", + "QuotationMarkResolutionSettings", + "QuotationMarkResolver", "QuoteConventionDetector", "QuoteConventionSet", "TextSegment", diff --git a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/analysis/depth_based_quotation_mark_resolver.py new file mode 100644 index 00000000..202cff00 --- /dev/null +++ b/machine/corpora/analysis/depth_based_quotation_mark_resolver.py @@ -0,0 +1,341 @@ +from typing import Generator, Set, Union + +import regex + +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_metadata import QuotationMarkMetadata +from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .quotation_mark_resolver import QuotationMarkResolver +from .quotation_mark_string_match import QuotationMarkStringMatch +from .usfm_marker_type import UsfmMarkerType + + +class QuotationMarkResolverState: + + def __init__(self): + self.quotation_stack: 
list[QuotationMarkMetadata] = [] + self.current_depth: int = 0 + + def get_current_depth(self) -> int: + return self.current_depth + 1 + + def has_open_quotation_mark(self) -> bool: + return self.current_depth > 0 + + def are_more_than_n_quotes_open(self, n: int) -> bool: + return self.current_depth > n + + def add_opening_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + quote = quote_match.resolve(self.current_depth + 1, QuotationMarkDirection.Opening) + self.quotation_stack.append(quote) + self.current_depth += 1 + return quote + + def add_closing_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + quote = quote_match.resolve(self.current_depth, QuotationMarkDirection.Closing) + self.quotation_stack.pop() + self.current_depth -= 1 + return quote + + def get_opening_quotation_mark_at_depth(self, depth: int) -> str: + if depth > len(self.quotation_stack): + raise RuntimeError( + "get_opening_quotation_mark_at_depth() was called with a depth greater than the quotation stack size." + ) + return self.quotation_stack[depth - 1].get_quotation_mark() + + def get_deepest_opening_quotation_mark(self) -> str: + if not self.has_open_quotation_mark(): + raise RuntimeError( + "get_deepest_opening_quotation_mark() was called when the stack of quotation marks was empty." 
+ ) + return self.quotation_stack[-1].get_quotation_mark() + + +class QuotationContinuerState: + def __init__(self): + self.quotation_continuer_stack: list[QuotationMarkMetadata] = [] + self.current_depth = 0 + + def get_current_depth(self) -> int: + return self.current_depth + + def has_continuer_been_observed(self) -> bool: + return len(self.quotation_continuer_stack) > 0 + + def add_quotation_continuer( + self, quote_match: QuotationMarkStringMatch, quotation_mark_resolver_state: QuotationMarkResolverState + ) -> QuotationMarkMetadata: + quote = quote_match.resolve(len(self.quotation_continuer_stack) + 1, QuotationMarkDirection.Opening) + self.quotation_continuer_stack.append(quote) + self.current_depth += 1 + if len(self.quotation_continuer_stack) == len(quotation_mark_resolver_state.quotation_stack): + self.quotation_continuer_stack.clear() + self.current_depth = 0 + return quote + + +class DepthBasedQuotationMarkResolver(QuotationMarkResolver): + apostrophe_pattern = regex.compile(r"[\'\u2019\u2018]", regex.U) + + def __init__(self, settings: QuotationMarkResolutionSettings): + self._settings = settings + self._quotation_mark_resolver_state = QuotationMarkResolverState() + self._quotation_continuer_state = QuotationContinuerState() + self._issues: Set[QuotationMarkResolutionIssue] = set() + + def reset(self) -> None: + self._quotation_mark_resolver_state = QuotationMarkResolverState() + self._quotation_continuer_state = QuotationContinuerState() + self._issues = set() + + def resolve_quotation_marks( + self, quote_matches: list[QuotationMarkStringMatch] + ) -> Generator[QuotationMarkMetadata, None, None]: + for quote_index, quote_match in enumerate(quote_matches): + previous_mark = None if quote_index == 0 else quote_matches[quote_index - 1] + next_mark = None if quote_index == len(quote_matches) - 1 else quote_matches[quote_index + 1] + yield from self._resolve_quotation_mark(quote_match, previous_mark, next_mark) + if 
self._quotation_mark_resolver_state.has_open_quotation_mark(): + self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) + + def _resolve_quotation_mark( + self, + quote_match: QuotationMarkStringMatch, + previous_mark: Union[QuotationMarkStringMatch, None], + next_mark: Union[QuotationMarkStringMatch, None], + ) -> Generator[QuotationMarkMetadata, None, None]: + if self._is_opening_quote(quote_match): + if self._is_quotation_continuer(quote_match, previous_mark, next_mark): + yield self._process_quotation_continuer(quote_match) + else: + if self._is_depth_too_great(): + self._issues.add(QuotationMarkResolutionIssue.TOO_DEEP_NESTING) + return + + yield self._process_opening_mark(quote_match) + elif self._is_apostrophe(quote_match, next_mark): + pass + elif self._is_closing_quote(quote_match): + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): + self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) + return + yield self._process_closing_mark(quote_match) + elif self._is_malformed_closing_quote(quote_match): + yield self._process_closing_mark(quote_match) + elif self._is_malformed_opening_quote(quote_match): + yield self._process_opening_mark(quote_match) + elif self._is_unpaired_closing_quote(quote_match): + self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) + else: + self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) + + def _is_quotation_continuer( + self, + quote_match: QuotationMarkStringMatch, + previous_match: Union[QuotationMarkStringMatch, None], + next_match: Union[QuotationMarkStringMatch, None], + ) -> bool: + if ( + self._settings.should_rely_on_paragraph_markers() + and not quote_match.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker) + ): + return False + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): + return False + + if quote_match.has_quote_introducer_in_leading_substring(): + return False + + if not 
self._quotation_continuer_state.has_continuer_been_observed(): + if quote_match.start_index > 0: + return False + if ( + quote_match.get_quotation_mark() + != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( + self._quotation_continuer_state.get_current_depth() + 1 + ) + ): + return False + if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): + if next_match is None or next_match.get_start_index() != quote_match.get_end_index(): + return False + elif ( + self._quotation_continuer_state.get_current_depth() + >= self._quotation_mark_resolver_state.get_current_depth() + ): + return False + else: + if ( + quote_match.get_quotation_mark() + != self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() + ): + return False + + return True + + def _process_quotation_continuer(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + return self._quotation_continuer_state.add_quotation_continuer(quote_match, self._quotation_mark_resolver_state) + + def _is_depth_too_great(self) -> bool: + return self._quotation_mark_resolver_state.are_more_than_n_quotes_open(4) + + def _process_opening_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + if not self._settings.does_metadata_match_quotation_mark( + quote_match.get_quotation_mark(), + self._quotation_mark_resolver_state.get_current_depth(), + QuotationMarkDirection.Opening, + ): + self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) + return self._quotation_mark_resolver_state.add_opening_quotation_mark(quote_match) + + def _process_closing_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + if not self._settings.does_metadata_match_quotation_mark( + quote_match.get_quotation_mark(), + self._quotation_mark_resolver_state.get_current_depth() - 1, + QuotationMarkDirection.Closing, + ): + self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) + return 
self._quotation_mark_resolver_state.add_closing_quotation_mark(quote_match) + + def _is_opening_quote( + self, + match: QuotationMarkStringMatch, + ) -> bool: + + if not self._settings.is_valid_opening_quotation_mark(match): + return False + + # if the quote convention is ambiguous, use whitespace as a clue + if self._settings.is_valid_closing_quotation_mark(match): + return ( + match.has_leading_whitespace() + or self._does_most_recent_opening_mark_immediately_precede(match) + or match.has_quote_introducer_in_leading_substring() + ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation()) + return True + + def _is_closing_quote( + self, + match: QuotationMarkStringMatch, + ) -> bool: + + if not self._settings.is_valid_closing_quotation_mark(match): + return False + + # if the quote convention is ambiguous, use whitespace as a clue + if self._settings.is_valid_opening_quotation_mark(match): + return ( + match.has_trailing_whitespace() + or match.has_trailing_punctuation() + # or match.has_trailing_closing_quotation_mark(self._possible_quote_convention_set) + ) and not match.has_leading_whitespace() + return True + + def _is_malformed_opening_quote( + self, + match: QuotationMarkStringMatch, + ) -> bool: + if not self._settings.is_valid_opening_quotation_mark(match): + return False + + if match.has_quote_introducer_in_leading_substring(): + return True + + if ( + match.has_leading_whitespace() + and match.has_trailing_whitespace() + and not self._quotation_mark_resolver_state.has_open_quotation_mark() + ): + return True + + return False + + def _is_malformed_closing_quote( + self, + match: QuotationMarkStringMatch, + ) -> bool: + if not self._settings.is_valid_closing_quotation_mark(match): + return False + + return ( + ( + (match.is_at_end_of_segment() or not match.has_trailing_whitespace()) + or (match.has_leading_whitespace() and match.has_trailing_whitespace()) + ) + and self._quotation_mark_resolver_state.has_open_quotation_mark() + and 
self._settings.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + ) + ) + + def _is_unpaired_closing_quote( + self, + match: QuotationMarkStringMatch, + ) -> bool: + if not self._settings.is_valid_closing_quotation_mark(match): + return False + + if self._quotation_mark_resolver_state.has_open_quotation_mark(): + return False + + return not match.has_leading_whitespace() and (match.is_at_end_of_segment() or match.has_trailing_whitespace()) + + def _does_most_recent_opening_mark_immediately_precede(self, match: QuotationMarkStringMatch) -> bool: + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): + return False + + return ( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.get_previous_character() + ) + + def _is_apostrophe( + self, + match: QuotationMarkStringMatch, + next_match: Union[QuotationMarkStringMatch, None], + ) -> bool: + if not match.does_quotation_mark_match(self.apostrophe_pattern): + return False + + # Latin letters on both sides of punctuation mark + if ( + match.get_previous_character() is not None + and match.has_leading_latin_letter() + and match.get_next_character() is not None + and match.has_trailing_latin_letter() + ): + return True + + # potential final s possessive (e.g. 
Moses') + if match.does_previous_character_match(regex.compile(r"s")) and ( + match.has_trailing_whitespace() or match.has_trailing_punctuation() + ): + # check whether it could be a closing quote + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): + return True + if not self._settings.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + ): + return True + if next_match is not None and self._settings.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), + next_match.get_quotation_mark(), + ): + return True + + # for languages that use apostrophes at the start and end of words + if ( + not self._quotation_mark_resolver_state.has_open_quotation_mark() + and match.get_quotation_mark() == "'" + or self._quotation_mark_resolver_state.has_open_quotation_mark() + and not self._settings.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + ) + ): + return True + + return False + + def get_issues(self) -> Set[QuotationMarkResolutionIssue]: + return self._issues diff --git a/machine/corpora/analysis/preliminary_quotation_analyzer.py b/machine/corpora/analysis/preliminary_quotation_analyzer.py index 882a6f99..6204243d 100644 --- a/machine/corpora/analysis/preliminary_quotation_analyzer.py +++ b/machine/corpora/analysis/preliminary_quotation_analyzer.py @@ -167,11 +167,11 @@ def is_mark_common_early_and_late(self, quotation_mark: str) -> bool: class QuotationMarkGrouper: - def __init__(self, quotation_marks: list[QuotationMarkStringMatch], quote_convention_set: QuoteConventionSet): + def __init__(self, quotation_marks: List[QuotationMarkStringMatch], quote_convention_set: QuoteConventionSet): self.quote_convention_set = quote_convention_set self._group_quotation_marks(quotation_marks) - def _group_quotation_marks(self, quotation_marks: 
list[QuotationMarkStringMatch]) -> None: + def _group_quotation_marks(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: self.grouped_quotation_marks: Dict[str, List[QuotationMarkStringMatch]] = dict() for quotation_mark_match in quotation_marks: if quotation_mark_match.get_quotation_mark() not in self.grouped_quotation_marks: @@ -230,7 +230,7 @@ def _reset_analysis(self) -> None: self.earlier_quotation_mark_counts: dict[str, int] = dict() self.later_quotation_mark_counts: dict[str, int] = dict() - def narrow_down_possible_quote_conventions(self, chapters: list[Chapter]) -> QuoteConventionSet: + def narrow_down_possible_quote_conventions(self, chapters: List[Chapter]) -> QuoteConventionSet: for chapter in chapters: self._analyze_quotation_marks_for_chapter(chapter) return self._select_compatible_quote_conventions() @@ -252,13 +252,13 @@ def _count_characters_in_verse(self, verse: Verse) -> None: def _count_characters_in_text_segment(self, text_segment: TextSegment) -> None: self.character_count_statistics.count_characters(text_segment) - def _analyze_quotation_mark_sequence(self, quotation_marks: list[QuotationMarkStringMatch]) -> None: + def _analyze_quotation_mark_sequence(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: quotation_mark_grouper: QuotationMarkGrouper = QuotationMarkGrouper(quotation_marks, self.quote_conventions) for earlier_mark, later_mark in quotation_mark_grouper.get_quotation_mark_pairs(): self.quotation_mark_sequences.record_earlier_quotation_mark(earlier_mark) self.quotation_mark_sequences.record_later_quotation_mark(later_mark) - def _count_verse_starting_and_ending_quotation_marks(self, quotation_marks: list[QuotationMarkStringMatch]) -> None: + def _count_verse_starting_and_ending_quotation_marks(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: for quotation_mark_match in quotation_marks: if quotation_mark_match.does_quotation_mark_match(self.apostrophe_pattern): 
self._count_apostrophe(quotation_mark_match) @@ -270,13 +270,13 @@ def _count_verse_starting_and_ending_quotation_marks(self, quotation_marks: list def _is_at_start_of_verse(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: return ( quotation_mark_match.get_text_segment().is_first_segment_in_verse() - and not quotation_mark_match.has_leading_letter() + and not quotation_mark_match.has_letter_in_leading_substring() ) def _is_at_end_of_verse(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: return ( quotation_mark_match.get_text_segment().is_last_segment_in_verse() - and not quotation_mark_match.has_trailing_letter() + and not quotation_mark_match.has_letter_in_trailing_substring() ) def _count_apostrophe(self, apostrophe_match: QuotationMarkStringMatch) -> None: diff --git a/machine/corpora/analysis/quotation_mark_direction.py b/machine/corpora/analysis/quotation_mark_direction.py index e3996423..f606f991 100644 --- a/machine/corpora/analysis/quotation_mark_direction.py +++ b/machine/corpora/analysis/quotation_mark_direction.py @@ -1,6 +1,6 @@ -from enum import Enum +from enum import Enum, auto class QuotationMarkDirection(Enum): - Opening = "Opening" - Closing = "Closing" + Opening = auto() + Closing = auto() diff --git a/machine/corpora/analysis/quotation_mark_resolution_issue.py b/machine/corpora/analysis/quotation_mark_resolution_issue.py new file mode 100644 index 00000000..4022722c --- /dev/null +++ b/machine/corpora/analysis/quotation_mark_resolution_issue.py @@ -0,0 +1,9 @@ +from enum import Enum, auto + + +class QuotationMarkResolutionIssue(Enum): + UNPAIRED_QUOTATION_MARK = auto() + TOO_DEEP_NESTING = auto() + INCOMPATIBLE_QUOTATION_MARK = auto() + AMBIGUOUS_QUOTATION_MARK = auto() + UNEXPECTED_QUOTATION_MARK = auto() diff --git a/machine/corpora/analysis/quotation_mark_resolution_settings.py b/machine/corpora/analysis/quotation_mark_resolution_settings.py new file mode 100644 index 00000000..59d4b6c4 --- /dev/null +++ 
b/machine/corpora/analysis/quotation_mark_resolution_settings.py @@ -0,0 +1,24 @@ +from abc import ABC +from typing import Set + +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_string_match import QuotationMarkStringMatch + + +class QuotationMarkResolutionSettings(ABC): + + def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: ... + + def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: ... + + def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: ... + + def should_rely_on_paragraph_markers(self) -> bool: ... + + def should_quit_on_error(self) -> bool: ... + + def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: ... + + def does_metadata_match_quotation_mark( + self, quotation_mark: str, depth: int, direction: QuotationMarkDirection + ) -> bool: ... diff --git a/machine/corpora/analysis/quotation_mark_resolver.py b/machine/corpora/analysis/quotation_mark_resolver.py index 5d98e5c6..658aa2e1 100644 --- a/machine/corpora/analysis/quotation_mark_resolver.py +++ b/machine/corpora/analysis/quotation_mark_resolver.py @@ -1,301 +1,21 @@ -from typing import Generator, Union +from abc import ABC +from typing import Generator, List, Set -import regex - -from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_metadata import QuotationMarkMetadata +from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings from .quotation_mark_string_match import QuotationMarkStringMatch -from .quote_convention_set import QuoteConventionSet -from .usfm_marker_type import UsfmMarkerType - - -class QuotationMarkResolverState: - - def __init__(self): - self.quotation_stack: list[QuotationMarkMetadata] = [] - self.current_depth: int = 0 - - def get_current_depth(self) -> int: - 
return self.current_depth + 1 - - def has_open_quotation_mark(self) -> bool: - return self.current_depth > 0 - - def are_more_than_n_quotes_open(self, n: int) -> bool: - return self.current_depth > n - - def add_opening_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - quote = quote_match.resolve(self.current_depth + 1, QuotationMarkDirection.Opening) - self.quotation_stack.append(quote) - self.current_depth += 1 - return quote - - def add_closing_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - quote = quote_match.resolve(self.current_depth, QuotationMarkDirection.Closing) - self.quotation_stack.pop() - self.current_depth -= 1 - return quote - - def get_deepest_opening_quotation_mark(self) -> str: - if not self.has_open_quotation_mark(): - raise RuntimeError( - "get_deepest_opening_quotation_mark() was called when the stack of quotation marks was empty." - ) - return self.quotation_stack[-1].get_quotation_mark() - -class QuotationContinuerState: - def __init__(self): - self.quotation_continuer_stack: list[QuotationMarkMetadata] = [] - self.current_depth = 0 - def get_current_depth(self) -> int: - return self.current_depth +class QuotationMarkResolver(ABC): - def has_continuer_been_observed(self) -> bool: - return len(self.quotation_continuer_stack) > 0 - - def add_quotation_continuer( - self, quote_match: QuotationMarkStringMatch, quotation_mark_resolver_state: QuotationMarkResolverState - ) -> QuotationMarkMetadata: - quote = quote_match.resolve(len(self.quotation_continuer_stack) + 1, QuotationMarkDirection.Opening) - self.quotation_continuer_stack.append(quote) - self.current_depth += 1 - if len(self.quotation_continuer_stack) == len(quotation_mark_resolver_state.quotation_stack): - self.quotation_continuer_stack.clear() - self.current_depth = 0 - return quote - - -class QuotationMarkResolver: - apostrophe_pattern = regex.compile(r"[\'\u2019\u2018]", regex.U) - - def __init__(self, 
quote_convention_set: QuoteConventionSet): - self._quote_convention_set = quote_convention_set - self._quotation_mark_resolver_state = QuotationMarkResolverState() - self._quotation_continuer_state = QuotationContinuerState() - - def reset(self) -> None: - self._quotation_mark_resolver_state = QuotationMarkResolverState() - self._quotation_continuer_state = QuotationContinuerState() + def __init__(self, settings: QuotationMarkResolutionSettings): + self.settings = settings def resolve_quotation_marks( - self, quote_matches: list[QuotationMarkStringMatch] - ) -> Generator[QuotationMarkMetadata, None, None]: - for quote_index, quote_match in enumerate(quote_matches): - previous_mark = None if quote_index == 0 else quote_matches[quote_index - 1] - next_mark = None if quote_index == len(quote_matches) - 1 else quote_matches[quote_index + 1] - yield from self._resolve_quotation_mark(quote_match, previous_mark, next_mark) - - def _resolve_quotation_mark( - self, - quote_match: QuotationMarkStringMatch, - previous_mark: Union[QuotationMarkStringMatch, None], - next_mark: Union[QuotationMarkStringMatch, None], - ) -> Generator[QuotationMarkMetadata, None, None]: - if self._is_opening_quote(quote_match, previous_mark, next_mark): - if self._is_quotation_continuer(quote_match, previous_mark, next_mark): - quote = self._process_quotation_continuer(quote_match) - yield quote - else: - if self._is_depth_too_great(): - return - - quote = self._process_opening_mark(quote_match) - yield quote - elif self._is_apostrophe(quote_match, previous_mark, next_mark): - pass - elif self._is_closing_quote(quote_match, previous_mark, next_mark): - if not self._quotation_mark_resolver_state.has_open_quotation_mark(): - return - quote = self._process_closing_mark(quote_match) - yield quote - elif self._is_malformed_closing_quote(quote_match, previous_mark, next_mark): - quote = self._process_closing_mark(quote_match) - yield quote - elif self._is_malformed_opening_quote(quote_match, 
previous_mark, next_mark): - quote = self._process_opening_mark(quote_match) - yield quote - - def _is_quotation_continuer( - self, - quote_match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, None], - next_match: Union[QuotationMarkStringMatch, None], - ) -> bool: - if not quote_match.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker): - return False - if not self._quotation_mark_resolver_state.has_open_quotation_mark(): - return False - - if not self._quotation_continuer_state.has_continuer_been_observed(): - if quote_match.start_index > 0: - return False - if ( - quote_match.get_quotation_mark() - != self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() - ): - return False - if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): - if next_match is None or next_match.get_start_index() != quote_match.get_end_index(): - return False - elif ( - self._quotation_continuer_state.get_current_depth() - >= self._quotation_mark_resolver_state.get_current_depth() - ): - return False - else: - if ( - quote_match.get_quotation_mark() - != self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() - ): - return False - - return True - - def _process_quotation_continuer(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - return self._quotation_continuer_state.add_quotation_continuer(quote_match, self._quotation_mark_resolver_state) - - def _is_depth_too_great(self) -> bool: - return self._quotation_mark_resolver_state.are_more_than_n_quotes_open(4) - - def _process_opening_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - return self._quotation_mark_resolver_state.add_opening_quotation_mark(quote_match) - - def _process_closing_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - return self._quotation_mark_resolver_state.add_closing_quotation_mark(quote_match) - - def _is_opening_quote( - self, - 
match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, None], - next_match: Union[QuotationMarkStringMatch, None], - ) -> bool: - - if not match.is_valid_opening_quotation_mark(self._quote_convention_set): - return False - - # if the quote convention is ambiguous, use whitespace as a clue - if match.is_valid_closing_quotation_mark(self._quote_convention_set): - return ( - match.has_leading_whitespace() - or self._does_most_recent_opening_mark_immediately_precede(match) - or match.has_leading_quote_introducer() - ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation()) - return True - - def _is_closing_quote( - self, - match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, None], - next_match: Union[QuotationMarkStringMatch, None], - ) -> bool: - - if not match.is_valid_closing_quotation_mark(self._quote_convention_set): - return False - - # if the quote convention is ambiguous, use whitespace as a clue - if self._quote_convention_set.is_valid_opening_quotation_mark(match.get_quotation_mark()): - return ( - match.has_trailing_whitespace() - or match.has_trailing_punctuation() - or match.has_trailing_closing_quotation_mark(self._quote_convention_set) - ) and not match.has_leading_whitespace() - return True - - def _is_malformed_opening_quote( - self, - match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, None], - next_match: Union[QuotationMarkStringMatch, None], - ) -> bool: - if not self._quote_convention_set.is_valid_opening_quotation_mark(match.get_quotation_mark()): - return False - - if match.has_leading_quote_introducer(): - return True - - if ( - match.has_leading_whitespace() - and match.has_trailing_whitespace() - and not self._quotation_mark_resolver_state.has_open_quotation_mark() - ): - return True - - return False - - def _is_malformed_closing_quote( - self, - match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, 
None], - next_match: Union[QuotationMarkStringMatch, None], - ) -> bool: - if not self._quote_convention_set.is_valid_closing_quotation_mark(match.get_quotation_mark()): - return False - - return ( - ( - not match.has_trailing_whitespace() - or (match.has_leading_whitespace() and match.has_trailing_whitespace()) - ) - and self._quotation_mark_resolver_state.has_open_quotation_mark() - and self._quote_convention_set.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() - ) - ) - - def _does_most_recent_opening_mark_immediately_precede(self, match: QuotationMarkStringMatch) -> bool: - if not self._quotation_mark_resolver_state.has_open_quotation_mark(): - return False - - return ( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.get_previous_character() - ) - - def _is_apostrophe( - self, - match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, None], - next_match: Union[QuotationMarkStringMatch, None], - ) -> bool: - if not match.does_quotation_mark_match(self.apostrophe_pattern): - return False - - # Latin letters on both sides of punctuation mark - if ( - match.get_previous_character() is not None - and match.has_leading_latin_letter() - and match.get_next_character() is not None - and match.has_trailing_latin_letter() - ): - return True - - # potential final s possessive (e.g. 
Moses') - if match.does_previous_character_match(regex.compile(r"s")) and ( - match.has_trailing_whitespace() or match.has_trailing_punctuation() - ): - # check whether it could be a closing quote - if not self._quotation_mark_resolver_state.has_open_quotation_mark(): - return True - if not self._quote_convention_set.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() - ): - return True - if next_match is not None and self._quote_convention_set.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), - next_match.get_quotation_mark(), - ): - return True + self, quote_matches: List[QuotationMarkStringMatch] + ) -> Generator[QuotationMarkMetadata, None, None]: ... - # for languages that use apostrophes at the start and end of words - if ( - not self._quotation_mark_resolver_state.has_open_quotation_mark() - and match.get_quotation_mark() == "'" - or self._quotation_mark_resolver_state.has_open_quotation_mark() - and not self._quote_convention_set.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() - ) - ): - return True + def reset(self) -> None: ... - return False + def get_issues(self) -> Set[QuotationMarkResolutionIssue]: ... 
diff --git a/machine/corpora/analysis/quotation_mark_string_match.py b/machine/corpora/analysis/quotation_mark_string_match.py index 55cd440a..ac168e03 100644 --- a/machine/corpora/analysis/quotation_mark_string_match.py +++ b/machine/corpora/analysis/quotation_mark_string_match.py @@ -17,7 +17,7 @@ class QuotationMarkStringMatch: latin_letter_pattern: Pattern = regex.compile(r"^\p{script=Latin}$", regex.U) whitespace_pattern: Pattern = regex.compile(r"[\s~]", regex.U) punctuation_pattern: Pattern = regex.compile(r"[\.,;\?!\)\]\-—۔،؛]", regex.U) - quote_introducer_pattern: Pattern = regex.compile(r"[:,]", regex.U) + quote_introducer_pattern: Pattern = regex.compile(r"[:,]\\s*", regex.U) def __init__(self, text_segment: TextSegment, start_index: int, end_index: int): self.text_segment = text_segment @@ -65,6 +65,12 @@ def get_next_character(self) -> Union[str, None]: return None return self.text_segment.get_text()[self.end_index] + def does_leading_substring_match(self, regex_pattern: regex.Pattern) -> bool: + return regex_pattern.search(self.text_segment.substring_before(self.start_index)) is not None + + def does_trailing_substring_match(self, regex_pattern: regex.Pattern) -> bool: + return regex_pattern.search(self.text_segment.substring_after(self.end_index)) is not None + # this assumes that the two matches occur in the same verse def precedes(self, other: "QuotationMarkStringMatch") -> bool: return self.text_segment.index_in_verse < other.text_segment.index_in_verse or ( @@ -116,16 +122,11 @@ def has_leading_punctuation(self) -> bool: def has_trailing_punctuation(self) -> bool: return self.does_next_character_match(self.punctuation_pattern) - # TODO: refactor this to use a passed regex pattern - def has_leading_letter(self) -> bool: - if self.letter_pattern.search(self.text_segment.substring_before(self.start_index)): - return True - return False + def has_letter_in_leading_substring(self) -> bool: + return 
self.does_leading_substring_match(self.letter_pattern) - def has_trailing_letter(self) -> bool: - if self.letter_pattern.search(self.text_segment.substring_after(self.end_index)): - return True - return False + def has_letter_in_trailing_substring(self) -> bool: + return self.does_trailing_substring_match(self.letter_pattern) def has_leading_latin_letter(self) -> bool: return self.does_previous_character_match(self.latin_letter_pattern) @@ -133,8 +134,8 @@ def has_leading_latin_letter(self) -> bool: def has_trailing_latin_letter(self) -> bool: return self.does_next_character_match(self.latin_letter_pattern) - def has_leading_quote_introducer(self) -> bool: - return self.does_previous_character_match(self.quote_introducer_pattern) + def has_quote_introducer_in_leading_substring(self) -> bool: + return self.does_leading_substring_match(self.quote_introducer_pattern) def has_leading_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: return self.does_previous_character_match(quote_convention_set.get_opening_quotation_mark_regex()) diff --git a/machine/corpora/analysis/quote_convention.py b/machine/corpora/analysis/quote_convention.py index 2f28a595..5d3ba60f 100644 --- a/machine/corpora/analysis/quote_convention.py +++ b/machine/corpora/analysis/quote_convention.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, Set from .quotation_mark_direction import QuotationMarkDirection @@ -65,7 +65,7 @@ def get_expected_quotation_mark(self, depth: int, direction: QuotationMarkDirect return "" return ( self.get_opening_quote_at_level(depth) - if direction == QuotationMarkDirection.Opening + if direction is QuotationMarkDirection.Opening else self.get_closing_quote_at_level(depth) ) @@ -81,6 +81,15 @@ def _includes_closing_quotation_mark(self, closing_quotation_mark: str) -> bool: return True return False + def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: + depths: Set[int] = set() + 
for depth, level in enumerate(self.levels, start=1): + if direction is QuotationMarkDirection.Opening and level.get_opening_quote() == quotation_mark: + depths.add(depth) + elif direction is QuotationMarkDirection.Closing and level.get_closing_quote() == quotation_mark: + depths.add(depth) + return depths + def is_compatible_with_observed_quotation_marks( self, opening_quotation_marks: list[str], closing_quotation_marks: list[str] ) -> bool: diff --git a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py b/machine/corpora/analysis/quote_convention_detection_resolution_settings.py new file mode 100644 index 00000000..43328a60 --- /dev/null +++ b/machine/corpora/analysis/quote_convention_detection_resolution_settings.py @@ -0,0 +1,35 @@ +from typing import Set + +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .quotation_mark_string_match import QuotationMarkStringMatch +from .quote_convention_set import QuoteConventionSet + + +class QuoteConventionDetectionResolutionSettings(QuotationMarkResolutionSettings): + + def __init__(self, quote_convention_set: QuoteConventionSet): + self._quote_convention_set = quote_convention_set + + def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: + return quotation_mark_match.is_valid_opening_quotation_mark(self._quote_convention_set) + + def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: + return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_set) + + def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: + return self._quote_convention_set.are_marks_a_valid_pair(opening_mark, closing_mark) + + def should_rely_on_paragraph_markers(self): + return True + + def should_quit_on_error(self) -> bool: + return True + + def get_possible_depths(self, quotation_mark: str, 
direction: QuotationMarkDirection) -> Set[int]: + return self._quote_convention_set.get_possible_depths(quotation_mark, direction) + + def does_metadata_match_quotation_mark( + self, quotation_mark: str, depth: int, direction: QuotationMarkDirection + ) -> bool: + return self._quote_convention_set.does_metadata_match_quotation_mark(quotation_mark, depth, direction) diff --git a/machine/corpora/analysis/quote_convention_detector.py b/machine/corpora/analysis/quote_convention_detector.py index 4295058f..b7b186a9 100644 --- a/machine/corpora/analysis/quote_convention_detector.py +++ b/machine/corpora/analysis/quote_convention_detector.py @@ -1,13 +1,14 @@ -from typing import Union +from typing import List, Union from .chapter import Chapter +from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver from .preliminary_quotation_analyzer import PreliminaryQuotationAnalyzer from .quotation_mark_finder import QuotationMarkFinder from .quotation_mark_metadata import QuotationMarkMetadata -from .quotation_mark_resolver import QuotationMarkResolver from .quotation_mark_string_match import QuotationMarkStringMatch from .quotation_mark_tabulator import QuotationMarkTabulator from .quote_convention import QuoteConvention +from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings from .quote_convention_set import QuoteConventionSet from .standard_quote_conventions import standard_quote_conventions from .usfm_structure_extractor import UsfmStructureExtractor @@ -42,12 +43,14 @@ def _count_quotation_marks_in_chapters(self, chapters: list[Chapter]) -> None: def _count_quotation_marks_in_chapter( self, chapter: Chapter, possible_quote_conventions: QuoteConventionSet ) -> None: - quotation_mark_matches: list[QuotationMarkStringMatch] = QuotationMarkFinder( + quotation_mark_matches: List[QuotationMarkStringMatch] = QuotationMarkFinder( possible_quote_conventions ).find_all_potential_quotation_marks_in_chapter(chapter) 
- resolved_quotation_marks: list[QuotationMarkMetadata] = list( - QuotationMarkResolver(possible_quote_conventions).resolve_quotation_marks(quotation_mark_matches) + resolved_quotation_marks: List[QuotationMarkMetadata] = list( + DepthBasedQuotationMarkResolver( + QuoteConventionDetectionResolutionSettings(possible_quote_conventions) + ).resolve_quotation_marks(quotation_mark_matches) ) self.quotation_mark_tabulator.tabulate(resolved_quotation_marks) diff --git a/machine/corpora/analysis/quote_convention_set.py b/machine/corpora/analysis/quote_convention_set.py index c115f4d9..b2d5a8ad 100644 --- a/machine/corpora/analysis/quote_convention_set.py +++ b/machine/corpora/analysis/quote_convention_set.py @@ -3,6 +3,7 @@ import regex +from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_tabulator import QuotationMarkTabulator from .quote_convention import QuoteConvention @@ -90,6 +91,12 @@ def get_possible_paired_quotation_marks(self, quotation_mark: str) -> Set[str]: paired_quotation_marks.update(self.opening_marks_by_closing_mark[quotation_mark]) return paired_quotation_marks + def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: + depths: Set[int] = set() + for convention in self.conventions: + depths.update(convention.get_possible_depths(quotation_mark, direction)) + return depths + def get_opening_quotation_mark_regex(self) -> Pattern: return self.opening_quotation_mark_regex @@ -99,6 +106,14 @@ def get_closing_quotation_mark_regex(self) -> Pattern: def get_quotation_mark_regex(self) -> Pattern: return self.all_quotation_mark_regex + def does_metadata_match_quotation_mark( + self, quotation_mark: str, depth: int, direction: QuotationMarkDirection + ) -> bool: + for convention in self.conventions: + if convention.get_expected_quotation_mark(depth, direction) == quotation_mark: + return True + return False + def filter_to_compatible_quote_conventions( self, opening_quotation_marks: list[str], 
closing_quotation_marks: list[str] ) -> "QuoteConventionSet": diff --git a/machine/corpora/analysis/usfm_marker_type.py b/machine/corpora/analysis/usfm_marker_type.py index e1dfc2c4..00bbbb1a 100644 --- a/machine/corpora/analysis/usfm_marker_type.py +++ b/machine/corpora/analysis/usfm_marker_type.py @@ -1,11 +1,11 @@ -from enum import Enum +from enum import Enum, auto class UsfmMarkerType(Enum): - ParagraphMarker = "ParagraphMarker" - CharacterMarker = "CharacterMarker" - VerseMarker = "VerseMarker" - ChapterMarker = "ChapterMarker" - EmbedMarker = "Embed" - Other = "Other" - NoMarker = "NoMarker" + ParagraphMarker = auto() + CharacterMarker = auto() + VerseMarker = auto() + ChapterMarker = auto() + EmbedMarker = auto() + Other = auto() + NoMarker = auto() diff --git a/machine/corpora/basic_quotation_mark_resolver.py b/machine/corpora/basic_quotation_mark_resolver.py new file mode 100644 index 00000000..da4a50de --- /dev/null +++ b/machine/corpora/basic_quotation_mark_resolver.py @@ -0,0 +1,119 @@ +from typing import Generator, Set, Union + +from .analysis.quotation_mark_direction import QuotationMarkDirection +from .analysis.quotation_mark_metadata import QuotationMarkMetadata +from .analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .analysis.quotation_mark_resolver import QuotationMarkResolver +from .analysis.quotation_mark_string_match import QuotationMarkStringMatch + + +class BasicQuotationMarkResolver(QuotationMarkResolver): + + def __init__(self, settings: QuotationMarkResolutionSettings): + self._settings: QuotationMarkResolutionSettings = settings + self._last_quotation_mark: Union[QuotationMarkMetadata, None] = None + self._issues: Set[QuotationMarkResolutionIssue] = set() + + def reset(self) -> None: + self._last_quotation_mark = None + self._issues = set() + + def resolve_quotation_marks( + self, quote_matches: 
list[QuotationMarkStringMatch] + ) -> Generator[QuotationMarkMetadata, None, None]: + for quote_match in quote_matches: + yield from self._resolve_quotation_mark(quote_match) + + def _resolve_quotation_mark( + self, + quote_match: QuotationMarkStringMatch, + ) -> Generator[QuotationMarkMetadata, None, None]: + if self._is_opening_quote(quote_match): + quote: Union[QuotationMarkMetadata, None] = self._resolve_opening_mark(quote_match) + if quote is not None: + yield quote + else: + self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) + elif self._is_closing_quote(quote_match): + quote: Union[QuotationMarkMetadata, None] = self._resolve_closing_mark(quote_match) + if quote is not None: + yield quote + else: + self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) + else: + self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) + + def _is_opening_quote( + self, + match: QuotationMarkStringMatch, + ) -> bool: + + if self._settings.is_valid_opening_quotation_mark(match) and self._settings.is_valid_closing_quotation_mark( + match + ): + return ( + match.has_leading_whitespace() + or self._does_most_recent_opening_mark_immediately_precede(match) + or match.has_quote_introducer_in_leading_substring() + ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation()) + elif self._settings.is_valid_opening_quotation_mark(match): + return True + + return False + + def _does_most_recent_opening_mark_immediately_precede( + self, + match: QuotationMarkStringMatch, + ) -> bool: + if ( + self._last_quotation_mark is None + or self._last_quotation_mark.get_direction() is not QuotationMarkDirection.Opening + ): + return False + + return ( + self._last_quotation_mark.get_text_segment() == match.get_text_segment() + and self._last_quotation_mark.get_end_index() == match.get_start_index() + ) + + def _is_closing_quote( + self, + match: QuotationMarkStringMatch, + ) -> bool: + + if 
self._settings.is_valid_closing_quotation_mark(match) and self._settings.is_valid_closing_quotation_mark( + match + ): + return ( + match.has_trailing_whitespace() or match.has_trailing_punctuation() + ) and not match.has_leading_whitespace() + elif self._settings.is_valid_closing_quotation_mark(match): + return True + + return False + + def _resolve_opening_mark(self, quote_match: QuotationMarkStringMatch) -> Union[QuotationMarkMetadata, None]: + possible_depths: Set[int] = self._settings.get_possible_depths( + quote_match.get_quotation_mark(), QuotationMarkDirection.Opening + ) + if len(possible_depths) == 0: + return None + + quote = quote_match.resolve(min(possible_depths), QuotationMarkDirection.Opening) + self._last_quotation_mark = quote + return quote + + def _resolve_closing_mark(self, quote_match: QuotationMarkStringMatch) -> Union[QuotationMarkMetadata, None]: + possible_depths: Set[int] = self._settings.get_possible_depths( + quote_match.get_quotation_mark(), QuotationMarkDirection.Closing + ) + if len(possible_depths) == 0: + return None + + quote = quote_match.resolve(min(possible_depths), QuotationMarkDirection.Closing) + self._last_quotation_mark = quote + return quote + + def get_issues(self) -> Set[QuotationMarkResolutionIssue]: + return self._issues diff --git a/machine/corpora/quotation_denormalization_action.py b/machine/corpora/quotation_denormalization_action.py new file mode 100644 index 00000000..d036421b --- /dev/null +++ b/machine/corpora/quotation_denormalization_action.py @@ -0,0 +1,7 @@ +from enum import Enum, auto + + +class QuotationDenormalizationAction(Enum): + APPLY_FULL = auto() + APPLY_BASIC = auto() + SKIP = auto() diff --git a/machine/corpora/quotation_denormalization_first_pass.py b/machine/corpora/quotation_denormalization_first_pass.py new file mode 100644 index 00000000..7e3f0c8c --- /dev/null +++ b/machine/corpora/quotation_denormalization_first_pass.py @@ -0,0 +1,82 @@ +from typing import Dict, List, Set + +from 
.analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .analysis.quotation_mark_finder import QuotationMarkFinder +from .analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .analysis.quotation_mark_resolver import QuotationMarkResolver +from .analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .analysis.quote_convention import QuoteConvention +from .analysis.quote_convention_set import QuoteConventionSet +from .analysis.usfm_structure_extractor import UsfmStructureExtractor +from .quotation_denormalization_action import QuotationDenormalizationAction +from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings + + +class QuotationDenormalizationFirstPass(UsfmStructureExtractor): + + def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): + super().__init__() + self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( + QuoteConventionSet([source_quote_convention.normalize()]) + ) + self._quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + QuotationDenormalizationResolutionSettings(source_quote_convention, target_quote_convention) + ) + self._will_basic_denormalization_work: bool = self._check_whether_basic_denormalization_will_work( + source_quote_convention, target_quote_convention + ) + + def _check_whether_basic_denormalization_will_work( + self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention + ) -> bool: + normalized_source_quote_convention: QuoteConvention = source_quote_convention.normalize() + target_marks_by_normalized_source_marks: Dict[str, Set[str]] = {} + for level in range(1, normalized_source_quote_convention.get_num_levels() + 1): + normalized_opening_quotation_mark = normalized_source_quote_convention.get_opening_quote_at_level(level) + if normalized_opening_quotation_mark not in 
target_marks_by_normalized_source_marks: + target_marks_by_normalized_source_marks[normalized_opening_quotation_mark] = set() + target_marks_by_normalized_source_marks[normalized_opening_quotation_mark].add( + target_quote_convention.get_closing_quote_at_level(level) + ) + + for normalized_source_mark in target_marks_by_normalized_source_marks: + if len(target_marks_by_normalized_source_marks[normalized_source_mark]) > 1: + return False + return True + + def get_best_actions_by_chapter(self, usfm_text: str) -> List[QuotationDenormalizationAction]: + best_actions_by_chapter: List[QuotationDenormalizationAction] = [] + + for chapter in self.get_chapters(): + best_actions_by_chapter.append(self._find_best_action_for_chapter(chapter)) + + return best_actions_by_chapter + + def _find_best_action_for_chapter(self, chapter) -> QuotationDenormalizationAction: + quotation_mark_matches: List[QuotationMarkStringMatch] = ( + self._quotation_mark_finder.find_all_potential_quotation_marks_in_chapter(chapter) + ) + + self._quotation_mark_resolver.reset() + list(self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches)) + + return self._choose_best_action_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) + + def _choose_best_action_based_on_observed_issues(self, issues) -> QuotationDenormalizationAction: + print(issues) + if ( + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues + or QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK in issues + ): + return QuotationDenormalizationAction.SKIP + + if ( + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK in issues + or QuotationMarkResolutionIssue.TOO_DEEP_NESTING in issues + or QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK in issues + ): + if self._will_basic_denormalization_work: + return QuotationDenormalizationAction.APPLY_BASIC + return QuotationDenormalizationAction.SKIP + + return QuotationDenormalizationAction.APPLY_FULL diff --git 
a/machine/corpora/quotation_denormalization_resolution_settings.py b/machine/corpora/quotation_denormalization_resolution_settings.py new file mode 100644 index 00000000..363a505c --- /dev/null +++ b/machine/corpora/quotation_denormalization_resolution_settings.py @@ -0,0 +1,37 @@ +from typing import Set + +from .analysis.quotation_mark_direction import QuotationMarkDirection +from .analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .analysis.quote_convention import QuoteConvention +from .analysis.quote_convention_set import QuoteConventionSet + + +class QuotationDenormalizationResolutionSettings(QuotationMarkResolutionSettings): + def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): + self._normalized_source_quote_convention = source_quote_convention.normalize() + self._normalized_quote_convention_singleton_set = QuoteConventionSet([self._normalized_source_quote_convention]) + self._target_quote_convention = target_quote_convention + + def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: + return quotation_mark_match.is_valid_opening_quotation_mark(self._normalized_quote_convention_singleton_set) + + def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: + return quotation_mark_match.is_valid_closing_quotation_mark(self._normalized_quote_convention_singleton_set) + + def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: + return self._normalized_quote_convention_singleton_set.are_marks_a_valid_pair(opening_mark, closing_mark) + + def should_rely_on_paragraph_markers(self): + return False + + def should_quit_on_error(self) -> bool: + return False + + def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: + return 
self._normalized_source_quote_convention.get_possible_depths(quotation_mark, direction) + + def does_metadata_match_quotation_mark( + self, quotation_mark: str, depth: int, direction: QuotationMarkDirection + ) -> bool: + return self._normalized_source_quote_convention.get_expected_quotation_mark(depth, direction) == quotation_mark diff --git a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py b/machine/corpora/quotation_denormalization_scripture_update_block_handler.py index 3c011b5b..b36d8b69 100644 --- a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py +++ b/machine/corpora/quotation_denormalization_scripture_update_block_handler.py @@ -1,5 +1,6 @@ -from typing import List +from typing import List, Union +from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver from .analysis.quotation_mark_finder import QuotationMarkFinder from .analysis.quotation_mark_resolver import QuotationMarkResolver from .analysis.quotation_mark_string_match import QuotationMarkStringMatch @@ -7,36 +8,68 @@ from .analysis.quote_convention_set import QuoteConventionSet from .analysis.text_segment import TextSegment from .analysis.usfm_marker_type import UsfmMarkerType +from .basic_quotation_mark_resolver import BasicQuotationMarkResolver +from .quotation_denormalization_action import QuotationDenormalizationAction +from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings +from .quotation_denormalization_settings import QuotationDenormalizationSettings from .scripture_update_block import ScriptureUpdateBlock from .scripture_update_block_handler import ScriptureUpdateBlockHandler from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType -from .usfm_token import UsfmTokenType +from .usfm_token import UsfmToken, UsfmTokenType class QuotationDenormalizationScriptureUpdateBlockHandler(ScriptureUpdateBlockHandler): - def __init__(self, 
target_quote_convention: QuoteConvention, should_run_on_existing_text: bool = False): + def __init__( + self, + source_quote_convention: QuoteConvention, + target_quote_convention: QuoteConvention, + settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings(), + ): super().__init__() + self._source_quote_convention: QuoteConvention = source_quote_convention self._target_quote_convention: QuoteConvention = target_quote_convention - self._normalized_quote_convention: QuoteConvention = target_quote_convention.normalize() - self._should_run_on_existing_text: bool = should_run_on_existing_text + self._settings: QuotationDenormalizationSettings = settings self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( - QuoteConventionSet([self._normalized_quote_convention]) + QuoteConventionSet([self._source_quote_convention.normalize()]) ) self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder() # Each embed represents a separate context for quotation marks - # (i.e. you can't open a quote in one and close it in another) + # (i.e. you can't open a quote in one context and close it in another) # so we need to keep track of the verse and embed contexts separately. 
- self._verse_text_quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( - QuoteConventionSet([self._normalized_quote_convention]) + resolution_settings = QuotationDenormalizationResolutionSettings( + self._source_quote_convention, self._target_quote_convention ) - self._embed_quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( - QuoteConventionSet([self._normalized_quote_convention]) + self._verse_text_quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + resolution_settings ) + self._embed_quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + resolution_settings + ) + self._simple_quotation_mark_resolver: QuotationMarkResolver = BasicQuotationMarkResolver(resolution_settings) + self._current_denormalization_action = QuotationDenormalizationAction.APPLY_FULL def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + if self._current_denormalization_action is QuotationDenormalizationAction.SKIP: + return block + if self._current_denormalization_action is QuotationDenormalizationAction.APPLY_BASIC: + return self._apply_simple_denormalization(block) + return self._apply_full_denormalization(block) + + def _apply_simple_denormalization(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + for element in block._elements: + if element.type == ScriptureUpdateElementType.EMBED_BLOCK or ( + element.type == ScriptureUpdateElementType.EXISTING_TEXT + and not self._settings.should_run_on_existing_text() + ): + continue + + self._process_scripture_element(element, self._simple_quotation_mark_resolver) + return block + + def _apply_full_denormalization(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: if len(block.elements) > 0 and block.elements[0].type == ScriptureUpdateElementType.EMBED: return self._process_embed_block(block) @@ -45,7 +78,10 @@ def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: def 
_process_embed_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: self._embed_quotation_mark_resolver.reset() for element in block._elements: - if element.type == ScriptureUpdateElementType.EXISTING_TEXT and not self._should_run_on_existing_text: + if ( + element.type == ScriptureUpdateElementType.EXISTING_TEXT + and not self._settings.should_run_on_existing_text() + ): continue self._process_scripture_element(element, self._embed_quotation_mark_resolver) @@ -55,7 +91,10 @@ def _process_verse_text_block(self, block: ScriptureUpdateBlock) -> ScriptureUpd for element in block._elements: if element.type == ScriptureUpdateElementType.EMBED_BLOCK: continue - if element.type == ScriptureUpdateElementType.EXISTING_TEXT and not self._should_run_on_existing_text: + if ( + element.type == ScriptureUpdateElementType.EXISTING_TEXT + and not self._settings.should_run_on_existing_text() + ): continue self._process_scripture_element(element, self._verse_text_quotation_mark_resolver) @@ -75,8 +114,7 @@ def _create_text_segments(self, element: ScriptureUpdateElement) -> List[TextSeg text_segments: List[TextSegment] = [] for token in element.get_tokens(): if token.type == UsfmTokenType.CHAPTER: - self._verse_text_quotation_mark_resolver.reset() - self._next_scripture_text_segment_builder = TextSegment.Builder() + self._start_new_chapter(token) self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) elif token.type == UsfmTokenType.VERSE: self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) @@ -87,14 +125,28 @@ def _create_text_segments(self, element: ScriptureUpdateElement) -> List[TextSeg elif token.type == UsfmTokenType.NOTE: self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) elif token.type == UsfmTokenType.TEXT: - self._next_scripture_text_segment_builder.set_usfm_token(token) - if token.text is not None: - 
self._next_scripture_text_segment_builder.set_text(token.text) - text_segments.append(self._next_scripture_text_segment_builder.build()) - else: - self._next_scripture_text_segment_builder = TextSegment.Builder() + text_segment: Union[TextSegment, None] = self._create_text_segment(token) + if text_segment is not None: + text_segments.append(text_segment) return self._set_previous_and_next_for_segments(text_segments) + def _start_new_chapter(self, token: UsfmToken) -> None: + chapter_number: Union[int, None] = int(token.data) if token.data is not None else None + if chapter_number is not None: + self._current_denormalization_action = self._settings.get_action_for_chapter(chapter_number) + self._verse_text_quotation_mark_resolver.reset() + self._next_scripture_text_segment_builder = TextSegment.Builder() + + def _create_text_segment(self, token: UsfmToken) -> Union[TextSegment, None]: + self._next_scripture_text_segment_builder.set_usfm_token(token) + if token.text is not None: + self._next_scripture_text_segment_builder.set_text(token.text) + text_segment_to_return: TextSegment = self._next_scripture_text_segment_builder.build() + self._next_scripture_text_segment_builder = TextSegment.Builder() + return text_segment_to_return + else: + self._next_scripture_text_segment_builder = TextSegment.Builder() + def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) -> List[TextSegment]: for i in range(len(text_segments)): if i > 0: diff --git a/machine/corpora/quotation_denormalization_settings.py b/machine/corpora/quotation_denormalization_settings.py new file mode 100644 index 00000000..cfb2bece --- /dev/null +++ b/machine/corpora/quotation_denormalization_settings.py @@ -0,0 +1,40 @@ +from .quotation_denormalization_action import QuotationDenormalizationAction + + +class QuotationDenormalizationSettings: + + def __init__(self): + self._should_run_on_existing_text = False + self._default_chapter_action = QuotationDenormalizationAction.APPLY_FULL + 
self._chapter_actions: list[QuotationDenormalizationAction] = [] + + def should_run_on_existing_text(self) -> bool: + return self._should_run_on_existing_text + + def get_action_for_chapter(self, chapter_number: int) -> QuotationDenormalizationAction: + if chapter_number <= len(self._chapter_actions): + return self._chapter_actions[chapter_number - 1] + return self._default_chapter_action + + class Builder: + def __init__(self): + self.settings = QuotationDenormalizationSettings() + + def run_on_existing_text(self) -> "QuotationDenormalizationSettings.Builder": + self.settings._should_run_on_existing_text = True + return self + + def set_chapter_actions( + self, chapter_actions: list[QuotationDenormalizationAction] + ) -> "QuotationDenormalizationSettings.Builder": + self.settings._chapter_actions = chapter_actions + return self + + def set_default_chapter_action( + self, action: QuotationDenormalizationAction + ) -> "QuotationDenormalizationSettings.Builder": + self.settings._default_chapter_action = action + return self + + def build(self): + return self.settings diff --git a/tests/corpora/analysis/test_quotation_mark_resolver.py b/tests/corpora/analysis/test_quotation_mark_resolver.py index 73784f8a..7f207eec 100644 --- a/tests/corpora/analysis/test_quotation_mark_resolver.py +++ b/tests/corpora/analysis/test_quotation_mark_resolver.py @@ -1,9 +1,19 @@ -from machine.corpora.analysis import QuotationMarkResolver, standard_quote_conventions +from typing import List + +from machine.corpora.analysis import ( + DepthBasedQuotationMarkResolver, + QuotationMarkResolver, + QuotationMarkStringMatch, + QuoteConventionDetectionResolutionSettings, + TextSegment, + UsfmMarkerType, + standard_quote_conventions, +) def test_reset() -> None: - quotation_mark_resolver: QuotationMarkResolver = QuotationMarkResolver( - standard_quote_conventions.standard_quote_conventions + quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + 
QuoteConventionDetectionResolutionSettings(standard_quote_conventions.standard_quote_conventions) ) assert quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack == [] @@ -17,3 +27,27 @@ def test_reset() -> None: assert quotation_mark_resolver._quotation_continuer_state.quotation_continuer_stack == [] assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 assert quotation_mark_resolver._quotation_continuer_state.current_depth == 0 + + quotation_mark_string_matches: List[QuotationMarkStringMatch] = [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("Opening “quote").build(), 8, 9), + QuotationMarkStringMatch(TextSegment.Builder().set_text("Another opening ‘quote").build(), 16, 17), + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("“‘quote continuer") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build(), + 0, + 1, + ), + ] + + list(quotation_mark_resolver.resolve_quotation_marks(quotation_mark_string_matches)) + assert len(quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack) > 0 + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth > 0 + + quotation_mark_resolver.reset() + + assert quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack == [] + assert quotation_mark_resolver._quotation_continuer_state.quotation_continuer_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + assert quotation_mark_resolver._quotation_continuer_state.current_depth == 0 diff --git a/tests/corpora/test_quotation_denormalization_first_pass.py b/tests/corpora/test_quotation_denormalization_first_pass.py new file mode 100644 index 00000000..10896f2d --- /dev/null +++ b/tests/corpora/test_quotation_denormalization_first_pass.py @@ -0,0 +1,205 @@ +from typing import List + +from machine.corpora import QuotationDenormalizationAction, QuotationDenormalizationFirstPass, parse_usfm +from machine.corpora.analysis import 
standard_quote_conventions + + +def test_no_issues_in_usfm() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'?" + """ + expected_actions = [QuotationDenormalizationAction.APPLY_FULL] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_opening_mark() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'? + """ + expected_actions = [QuotationDenormalizationAction.APPLY_BASIC] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_closing_mark() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, Has God really said, + You shall not eat of any tree of the garden?" + """ + expected_actions = [QuotationDenormalizationAction.APPLY_BASIC] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_too_deep_nesting() -> None: + normalized_usfm = """\\c 1 + \\v 1 "Now the serpent was more "subtle than any animal + of the "field which "Yahweh God had made. + He said to the woman, "Has God really said, + "You shall not eat of any tree of the garden? 
+ """ + expected_actions = [QuotationDenormalizationAction.APPLY_BASIC] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_quotation_mark() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman"Has God really said, + You shall not eat of any tree of the garden? + """ + expected_actions = [QuotationDenormalizationAction.SKIP] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_no_issues_in_multiple_chapters() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \\c 2 \\v 1 He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'?" + """ + expected_actions = [QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_FULL] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_quotation_mark_in_second_chapter() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?" 
+ """ + expected_actions = [QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_BASIC] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_quotation_mark_in_first_chapter() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had" made. + \\c 2 \\v 1 He said to the woman, Has God really said, + "You shall not eat of any tree of the garden?" + """ + expected_actions = [QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_FULL] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_quotation_mark_in_second_chapter() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not"eat of any tree of the garden?" + """ + expected_actions = [QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.SKIP] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_quotation_mark_in_first_chapter() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field"which Yahweh God had made. + \\c 2 \\v 1 He said to the woman, Has God really said, + "You shall not eat of any tree of the garden?" 
+ """ + expected_actions = [QuotationDenormalizationAction.SKIP, QuotationDenormalizationAction.APPLY_FULL] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_quotation_mark_in_both_chapters() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had" made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?" + """ + expected_actions = [QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_BASIC] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_quotation_mark_in_both_chapters() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had"made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any"tree of the garden? + """ + expected_actions = [QuotationDenormalizationAction.SKIP, QuotationDenormalizationAction.SKIP] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_in_first_ambiguous_in_second() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made." + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any"tree of the garden? 
+ """ + expected_actions = [QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.SKIP] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_in_first_unpaired_in_second() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God"had made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?" + """ + expected_actions = [QuotationDenormalizationAction.SKIP, QuotationDenormalizationAction.APPLY_BASIC] + observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def run_quotation_denormalization_first_pass( + normalized_usfm: str, source_quote_convention_name: str, target_quote_convention_name: str +) -> List[QuotationDenormalizationAction]: + source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + source_quote_convention_name + ) + assert source_quote_convention is not None + + target_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + target_quote_convention_name + ) + assert target_quote_convention is not None + + first_pass_analyzer = QuotationDenormalizationFirstPass(source_quote_convention, target_quote_convention) + parse_usfm(normalized_usfm, first_pass_analyzer) + + return first_pass_analyzer.get_best_actions_by_chapter(normalized_usfm) diff --git a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py b/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py index 5197773f..aa330091 100644 --- a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py +++ 
b/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py @@ -1,4 +1,10 @@ -from machine.corpora import QuotationDenormalizationScriptureUpdateBlockHandler, UpdateUsfmParserHandler, parse_usfm +from machine.corpora import ( + QuotationDenormalizationAction, + QuotationDenormalizationScriptureUpdateBlockHandler, + QuotationDenormalizationSettings, + UpdateUsfmParserHandler, + parse_usfm, +) from machine.corpora.analysis import standard_quote_conventions simple_normalized_usfm = """\\c 1 @@ -17,7 +23,7 @@ def test_simple_english_quote_denormalization() -> None: + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") assert_usfm_equal(observed_usfm, expected_usfm) @@ -34,7 +40,7 @@ def test_simple_british_english_quote_denormalization() -> None: + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "british_english") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "british_english", "british_english") assert_usfm_equal(observed_usfm, expected_usfm) @@ -47,7 +53,7 @@ def test_simple_typewriter_english_quote_denormalization() -> None: + "the woman, \"Has God really said, 'You shall not eat of any tree of the garden'?\"" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_english") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "typewriter_english") assert_usfm_equal(observed_usfm, expected_usfm) @@ -60,7 +66,7 @@ def test_simple_hybrid_typewriter_english_quote_denormalization() -> None: + "the woman, “Has God really said, 'You shall not eat of any tree of the garden'?”" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "hybrid_typewriter_english") + 
observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "hybrid_typewriter_english") assert_usfm_equal(observed_usfm, expected_usfm) @@ -79,7 +85,7 @@ def test_simple_french_quote_denormalization() -> None: + "the woman, «Has God really said, ‹You shall not eat of any tree of the garden›?»" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_french") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_french", "standard_french") assert_usfm_equal(observed_usfm, expected_usfm) @@ -97,7 +103,7 @@ def test_simple_typewriter_french_quote_denormalization() -> None: + "the woman, <?>>" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_french") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_french", "typewriter_french") assert_usfm_equal(observed_usfm, expected_usfm) @@ -115,7 +121,7 @@ def test_simple_western_european_quote_denormalization() -> None: + "the woman, «Has God really said, “You shall not eat of any tree of the garden”?»" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "western_european") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "western_european", "western_european") assert_usfm_equal(observed_usfm, expected_usfm) @@ -132,7 +138,9 @@ def test_simple_typewriter_western_european_quote_denormalization() -> None: + 'the woman, <>' ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_western_european") + observed_usfm = denormalize_quotation_marks( + normalized_usfm, "typewriter_western_european", "typewriter_western_european" + ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -149,7 +157,9 @@ def test_simple_typewriter_western_european_variant_quote_denormalization() -> N + 'the woman, "Has God really said, ?"' ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "typewriter_western_european_variant") + observed_usfm = denormalize_quotation_marks( + 
normalized_usfm, "typewriter_western_european_variant", "typewriter_western_european_variant" + ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -166,7 +176,9 @@ def test_simple_hybrid_typewriter_western_european_quote_denormalization() -> No + 'the woman, «Has God really said, "You shall not eat of any tree of the garden"?»' ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "hybrid_typewriter_western_european") + observed_usfm = denormalize_quotation_marks( + normalized_usfm, "hybrid_typewriter_western_european", "hybrid_typewriter_western_european" + ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -183,7 +195,7 @@ def test_simple_central_european_quote_denormalization() -> None: + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden‘?“" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "central_european") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "central_european", "central_european") assert_usfm_equal(observed_usfm, expected_usfm) @@ -200,7 +212,9 @@ def test_simple_central_european_guillemets_quote_denormalization() -> None: + "the woman, »Has God really said, ›You shall not eat of any tree of the garden‹?«" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "central_european_guillemets") + observed_usfm = denormalize_quotation_marks( + normalized_usfm, "central_european_guillemets", "central_european_guillemets" + ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -217,7 +231,7 @@ def test_simple_swedish_quote_denormalization() -> None: + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden’?”" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_swedish") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_swedish", "standard_swedish") assert_usfm_equal(observed_usfm, expected_usfm) @@ -229,7 +243,7 @@ def test_simple_finnish_quote_denormalization() -> None: + "the woman, »Has God really 
said, ’You shall not eat of any tree of the garden’?»" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_finnish") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_finnish") assert_usfm_equal(observed_usfm, expected_usfm) @@ -241,7 +255,7 @@ def test_simple_eastern_european_quote_denormalization() -> None: + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden’?”" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "eastern_european") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "eastern_european") assert_usfm_equal(observed_usfm, expected_usfm) @@ -258,7 +272,7 @@ def test_simple_russian_quote_denormalization() -> None: + "the woman, «Has God really said, „You shall not eat of any tree of the garden“?»" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_russian") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_russian", "standard_russian") assert_usfm_equal(observed_usfm, expected_usfm) @@ -270,7 +284,7 @@ def test_simple_arabic_quote_denormalization() -> None: + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden‘?“" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_arabic") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_arabic") assert_usfm_equal(observed_usfm, expected_usfm) @@ -289,7 +303,7 @@ def test_quotes_spanning_verses() -> None: + "\\v 2 ‘You shall not eat of any tree of the garden’?”" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") assert_usfm_equal(observed_usfm, expected_usfm) @@ -306,7 +320,7 @@ def test_single_embed() -> None: + "\\f + \\ft “This is a ‘footnote’” \\f* of the field which Yahweh God had made." 
) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") assert_usfm_equal(observed_usfm, expected_usfm) @@ -324,7 +338,7 @@ def test_multiple_embeds() -> None: + "“footnote” here \\f* which Yahweh God had made." ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") assert_usfm_equal(observed_usfm, expected_usfm) @@ -344,7 +358,7 @@ def test_quotes_in_text_and_embed() -> None: + "said, ‘You shall not eat of any tree of the garden’?”" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") assert_usfm_equal(observed_usfm, expected_usfm) @@ -366,21 +380,364 @@ def test_quotes_in_multiple_verses_and_embed() -> None: + "said, ‘You shall not eat of any tree of the garden’?”" ) - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english") + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# Basic denormalization does not consider the nesting of quotation marks, +# but only determines opening/closing marks and maps based on that. +def test_basic_quotation_denormalization_same_as_full() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) + .build(), + ) assert_usfm_equal(observed_usfm, expected_usfm) +def test_basic_quotation_denormalization_incorrectly_nested() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + "You shall not eat of any tree of the garden"?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_basic_quotation_denormalization_incorrectly_nested_second_case() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + "You shall not eat of any tree of the garden"?' + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_basic_quotation_denormalization_unclosed_quote() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_default_denormalization_action() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" + """ + expected_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_basic_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_skipped_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + 'the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder().run_on_existing_text().build(), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_default_chapter_action(QuotationDenormalizationAction.APPLY_FULL) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_basic_usfm) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_default_chapter_action(QuotationDenormalizationAction.SKIP) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_skipped_usfm) + + +def test_single_chapter_denormalization_action() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" 
+ """ + expected_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_basic_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_skipped_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + 'the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_chapter_actions([QuotationDenormalizationAction.APPLY_FULL]) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_chapter_actions([QuotationDenormalizationAction.APPLY_BASIC]) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_basic_usfm) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_chapter_actions([QuotationDenormalizationAction.SKIP]) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_skipped_usfm) + + +def test_multiple_chapter_same_denormalization_action() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle" than any animal + of the field which Yahweh God had made. 
+ \\c 2 + \\v 1 He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" + """ + expected_full_usfm = ( + "\\c 1\n" + + '\\v 1 Now the serpent was more subtle" than any animal of the field which Yahweh God had made.\n' + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_basic_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_chapter_actions([QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_FULL]) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_chapter_actions([QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_BASIC]) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_basic_usfm) + + +def test_multiple_chapter_multiple_denormalization_actions() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle" than any animal + of the field which Yahweh God had made. + \\c 2 + \\v 1 He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" 
+ """ + expected_full_then_basic_usfm = ( + "\\c 1\n" + + '\\v 1 Now the serpent was more subtle" than any animal of the field which Yahweh God had made.\n' + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_basic_then_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_basic_then_skip_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + '\\v 1 He said to the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_chapter_actions([QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_BASIC]) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_full_then_basic_usfm) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_chapter_actions([QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_FULL]) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_basic_then_full_usfm) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, + "standard_english", + "standard_english", + QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .set_chapter_actions([QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.SKIP]) + .build(), + ) + assert_usfm_equal(observed_usfm, expected_basic_then_skip_usfm) + + def denormalize_quotation_marks( - 
normalized_usfm: str, quote_convention_name: str, should_run_on_existing_text=True + normalized_usfm: str, + source_quote_convention_name: str, + target_quote_convention_name: str, + quotation_denormalization_settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings.Builder() + .run_on_existing_text() + .build(), ) -> str: - standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(quote_convention_name) + source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + source_quote_convention_name + ) + assert source_quote_convention is not None + + target_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + target_quote_convention_name ) - assert standard_english_quote_convention is not None + assert target_quote_convention is not None quotation_denormalizer: QuotationDenormalizationScriptureUpdateBlockHandler = ( QuotationDenormalizationScriptureUpdateBlockHandler( - standard_english_quote_convention, should_run_on_existing_text=should_run_on_existing_text + source_quote_convention, + target_quote_convention, + quotation_denormalization_settings, ) ) From a43147e14232fa73d5b448bb0a4d0524d7b261b5 Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 8 May 2025 14:37:35 -0400 Subject: [PATCH 11/31] Unit tests for basic quotation mark resolver --- machine/corpora/__init__.py | 2 + machine/corpora/analysis/__init__.py | 8 + .../analysis/quotation_mark_metadata.py | 12 + .../analysis/quotation_mark_string_match.py | 2 +- machine/corpora/analysis/quote_convention.py | 12 +- .../corpora/analysis/quote_convention_set.py | 16 +- machine/corpora/analysis/text_segment.py | 13 + .../corpora/basic_quotation_mark_resolver.py | 10 +- .../quotation_denormalization_first_pass.py | 18 +- .../corpora/analysis/test_quote_convention.py | 20 + .../test_basic_quotation_mark_resolver.py | 313 +++++++++++++++ 
...st_quotation_denormalization_first_pass.py | 356 +++++++++++++++++- 12 files changed, 761 insertions(+), 21 deletions(-) create mode 100644 tests/corpora/analysis/test_quote_convention.py create mode 100644 tests/corpora/test_basic_quotation_mark_resolver.py diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 23e95124..f3c8804d 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -2,6 +2,7 @@ from .alignment_collection import AlignmentCollection from .alignment_corpus import AlignmentCorpus from .alignment_row import AlignmentRow +from .basic_quotation_mark_resolver import BasicQuotationMarkResolver from .corpora_utils import batch from .corpus import Corpus from .dbl_bundle_text_corpus import DblBundleTextCorpus @@ -93,6 +94,7 @@ "AlignmentCollection", "AlignmentCorpus", "AlignmentRow", + "BasicQuotationMarkResolver", "batch", "Corpus", "create_versification_ref_corpus", diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/analysis/__init__.py index 8bb23a5b..849222ff 100644 --- a/machine/corpora/analysis/__init__.py +++ b/machine/corpora/analysis/__init__.py @@ -1,4 +1,7 @@ +from .chapter import Chapter from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_metadata import QuotationMarkMetadata from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings from .quotation_mark_resolver import QuotationMarkResolver @@ -9,9 +12,13 @@ from .quote_convention_set import QuoteConventionSet from .text_segment import TextSegment from .usfm_marker_type import UsfmMarkerType +from .verse import Verse __all__ = [ + "Chapter", "DepthBasedQuotationMarkResolver", + "QuotationMarkDirection", + "QuotationMarkMetadata", "QuotationMarkStringMatch", "QuoteConvention", "QuoteConventionAnalysis", @@ -23,4 +30,5 @@ 
"QuoteConventionSet", "TextSegment", "UsfmMarkerType", + "Verse", ] diff --git a/machine/corpora/analysis/quotation_mark_metadata.py b/machine/corpora/analysis/quotation_mark_metadata.py index 72736ec1..efe3ce45 100644 --- a/machine/corpora/analysis/quotation_mark_metadata.py +++ b/machine/corpora/analysis/quotation_mark_metadata.py @@ -20,6 +20,18 @@ def __init__( self.start_index = start_index self.end_index = end_index + def __eq__(self, other): + if not isinstance(other, QuotationMarkMetadata): + return False + return ( + self.quotation_mark == other.quotation_mark + and self.depth == other.depth + and self.direction == other.direction + and self.text_segment == other.text_segment + and self.start_index == other.start_index + and self.end_index == other.end_index + ) + def get_quotation_mark(self) -> str: return self.quotation_mark diff --git a/machine/corpora/analysis/quotation_mark_string_match.py b/machine/corpora/analysis/quotation_mark_string_match.py index ac168e03..d8af13e9 100644 --- a/machine/corpora/analysis/quotation_mark_string_match.py +++ b/machine/corpora/analysis/quotation_mark_string_match.py @@ -17,7 +17,7 @@ class QuotationMarkStringMatch: latin_letter_pattern: Pattern = regex.compile(r"^\p{script=Latin}$", regex.U) whitespace_pattern: Pattern = regex.compile(r"[\s~]", regex.U) punctuation_pattern: Pattern = regex.compile(r"[\.,;\?!\)\]\-—۔،؛]", regex.U) - quote_introducer_pattern: Pattern = regex.compile(r"[:,]\\s*", regex.U) + quote_introducer_pattern: Pattern = regex.compile(r"[:,]\s*$", regex.U) def __init__(self, text_segment: TextSegment, start_index: int, end_index: int): self.text_segment = text_segment diff --git a/machine/corpora/analysis/quote_convention.py b/machine/corpora/analysis/quote_convention.py index 5d3ba60f..7d959df0 100644 --- a/machine/corpora/analysis/quote_convention.py +++ b/machine/corpora/analysis/quote_convention.py @@ -111,10 +111,18 @@ def normalize(self) -> "QuoteConvention": return 
QuoteConvention(self.get_name() + "_normalized", [level.normalize() for level in self.levels]) def print_summary(self) -> None: - print(self.get_name()) + print(self._get_summary_message()) + + def _get_summary_message(self) -> str: + summary = self.get_name() + "\n" for level, convention in enumerate(self.levels): ordinal_name = self._get_ordinal_name(level + 1) - print("%s%s-level quote%s" % (convention.get_opening_quote(), ordinal_name, convention.get_closing_quote())) + summary += "%s%s-level quote%s\n" % ( + convention.get_opening_quote(), + ordinal_name, + convention.get_closing_quote(), + ) + return summary def _get_ordinal_name(self, level) -> str: if level == 1: diff --git a/machine/corpora/analysis/quote_convention_set.py b/machine/corpora/analysis/quote_convention_set.py index b2d5a8ad..c1af9ef4 100644 --- a/machine/corpora/analysis/quote_convention_set.py +++ b/machine/corpora/analysis/quote_convention_set.py @@ -29,12 +29,20 @@ def _create_quote_regexes(self) -> None: all_quotation_marks.add(opening_quote) all_quotation_marks.add(closing_quote) - self.opening_quotation_mark_regex: Pattern = regex.compile(r"[" + "".join(opening_quotation_marks) + "]") - self.closing_quotation_mark_regex: Pattern = regex.compile(r"[" + "".join(closing_quotation_marks) + "]") - self.all_quotation_mark_regex: Pattern = regex.compile(r"[" + "".join(all_quotation_marks) + "]") - else: + if len(all_quotation_marks) > 0: + self.opening_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(opening_quotation_marks) + "]" + ) + self.closing_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(closing_quotation_marks) + "]" + ) + self.all_quotation_mark_regex: Pattern = regex.compile(r"[" + "".join(all_quotation_marks) + "]") + + if len(opening_quotation_marks) == 0: self.opening_quotation_mark_regex = regex.compile(r"") + if len(closing_quotation_marks) == 0: self.closing_quotation_mark_regex = regex.compile(r"") + if len(all_quotation_marks) == 0: 
self.all_quotation_mark_regex = regex.compile(r"") def _create_quotation_mark_pair_map(self) -> None: diff --git a/machine/corpora/analysis/text_segment.py b/machine/corpora/analysis/text_segment.py index ec94b08a..5be2413c 100644 --- a/machine/corpora/analysis/text_segment.py +++ b/machine/corpora/analysis/text_segment.py @@ -15,6 +15,19 @@ def __init__(self): self.num_segments_in_verse: int = 0 self.usfm_token: Union[UsfmToken, None] = None + def __eq__(self, value): + if not isinstance(value, TextSegment): + return False + if self.text != value.text: + return False + if self.index_in_verse != value.index_in_verse: + return False + if self.usfm_token != value.usfm_token: + return False + if self.immediate_preceding_marker != value.immediate_preceding_marker: + return False + return True + def get_text(self) -> str: return self.text diff --git a/machine/corpora/basic_quotation_mark_resolver.py b/machine/corpora/basic_quotation_mark_resolver.py index da4a50de..b9f6f04f 100644 --- a/machine/corpora/basic_quotation_mark_resolver.py +++ b/machine/corpora/basic_quotation_mark_resolver.py @@ -30,18 +30,21 @@ def _resolve_quotation_mark( quote_match: QuotationMarkStringMatch, ) -> Generator[QuotationMarkMetadata, None, None]: if self._is_opening_quote(quote_match): + print("Opening quote: %s" % quote_match.get_context()) quote: Union[QuotationMarkMetadata, None] = self._resolve_opening_mark(quote_match) if quote is not None: yield quote else: self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) elif self._is_closing_quote(quote_match): + print("Closing quote: %s" % quote_match.get_context()) quote: Union[QuotationMarkMetadata, None] = self._resolve_closing_mark(quote_match) if quote is not None: yield quote else: self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) else: + print("Unknown quote %s" % quote_match.get_context()) self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) def _is_opening_quote( @@ -53,7 
+56,8 @@ def _is_opening_quote( match ): return ( - match.has_leading_whitespace() + match.is_at_start_of_segment() + or match.has_leading_whitespace() or self._does_most_recent_opening_mark_immediately_precede(match) or match.has_quote_introducer_in_leading_substring() ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation()) @@ -82,11 +86,11 @@ def _is_closing_quote( match: QuotationMarkStringMatch, ) -> bool: - if self._settings.is_valid_closing_quotation_mark(match) and self._settings.is_valid_closing_quotation_mark( + if self._settings.is_valid_opening_quotation_mark(match) and self._settings.is_valid_closing_quotation_mark( match ): return ( - match.has_trailing_whitespace() or match.has_trailing_punctuation() + match.has_trailing_whitespace() or match.has_trailing_punctuation() or match.is_at_end_of_segment() ) and not match.has_leading_whitespace() elif self._settings.is_valid_closing_quotation_mark(match): return True diff --git a/machine/corpora/quotation_denormalization_first_pass.py b/machine/corpora/quotation_denormalization_first_pass.py index 7e3f0c8c..e7d4d1f9 100644 --- a/machine/corpora/quotation_denormalization_first_pass.py +++ b/machine/corpora/quotation_denormalization_first_pass.py @@ -1,5 +1,6 @@ from typing import Dict, List, Set +from .analysis.chapter import Chapter from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver from .analysis.quotation_mark_finder import QuotationMarkFinder from .analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue @@ -35,9 +36,10 @@ def _check_whether_basic_denormalization_will_work( normalized_opening_quotation_mark = normalized_source_quote_convention.get_opening_quote_at_level(level) if normalized_opening_quotation_mark not in target_marks_by_normalized_source_marks: target_marks_by_normalized_source_marks[normalized_opening_quotation_mark] = set() - target_marks_by_normalized_source_marks[normalized_opening_quotation_mark].add( 
- target_quote_convention.get_closing_quote_at_level(level) - ) + if level <= target_quote_convention.get_num_levels(): + target_marks_by_normalized_source_marks[normalized_opening_quotation_mark].add( + target_quote_convention.get_closing_quote_at_level(level) + ) for normalized_source_mark in target_marks_by_normalized_source_marks: if len(target_marks_by_normalized_source_marks[normalized_source_mark]) > 1: @@ -52,28 +54,26 @@ def get_best_actions_by_chapter(self, usfm_text: str) -> List[QuotationDenormali return best_actions_by_chapter - def _find_best_action_for_chapter(self, chapter) -> QuotationDenormalizationAction: + def _find_best_action_for_chapter(self, chapter: Chapter) -> QuotationDenormalizationAction: quotation_mark_matches: List[QuotationMarkStringMatch] = ( self._quotation_mark_finder.find_all_potential_quotation_marks_in_chapter(chapter) ) self._quotation_mark_resolver.reset() + + # use list() to force evaluation of the generator list(self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches)) return self._choose_best_action_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) def _choose_best_action_based_on_observed_issues(self, issues) -> QuotationDenormalizationAction: print(issues) - if ( - QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues - or QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK in issues - ): + if QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues: return QuotationDenormalizationAction.SKIP if ( QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK in issues or QuotationMarkResolutionIssue.TOO_DEEP_NESTING in issues - or QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK in issues ): if self._will_basic_denormalization_work: return QuotationDenormalizationAction.APPLY_BASIC diff --git a/tests/corpora/analysis/test_quote_convention.py b/tests/corpora/analysis/test_quote_convention.py new file mode 100644 index 00000000..3e4e7a7f --- /dev/null +++ 
b/tests/corpora/analysis/test_quote_convention.py @@ -0,0 +1,20 @@ +from machine.corpora.analysis.quote_convention import QuoteConvention, SingleLevelQuoteConvention + + +def test_print_summary(): + quote_convention = QuoteConvention( + "test-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201D"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201D", "\u201D"), + ], + ) + expected_summary_message = ( + "test-quote-convention\n" + + "\u201CFirst-level quote\u201D\n" + + "\u2018Second-level quote\u2019\n" + + "\u201DThird-level quote\u201D\n" + ) + assert quote_convention._get_summary_message() == expected_summary_message + assert True diff --git a/tests/corpora/test_basic_quotation_mark_resolver.py b/tests/corpora/test_basic_quotation_mark_resolver.py new file mode 100644 index 00000000..8ea7a362 --- /dev/null +++ b/tests/corpora/test_basic_quotation_mark_resolver.py @@ -0,0 +1,313 @@ +from machine.corpora import BasicQuotationMarkResolver, QuotationDenormalizationResolutionSettings +from machine.corpora.analysis import ( + QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkResolutionIssue, + QuotationMarkStringMatch, + QuoteConventionDetectionResolutionSettings, + QuoteConventionSet, + TextSegment, + standard_quote_conventions, +) + + +def test_reset(): + english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + "standard_english" + ) + assert english_quote_convention is not None + + basic_quotation_mark_resolver = BasicQuotationMarkResolver( + QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + ) + + basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 + ) + basic_quotation_mark_resolver._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) + + 
basic_quotation_mark_resolver.reset() + assert basic_quotation_mark_resolver._last_quotation_mark is None + assert len(basic_quotation_mark_resolver._issues) == 0 + + +def test_simple_quotation_mark_resolution(): + english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + "standard_english" + ) + assert english_quote_convention is not None + + basic_quotation_mark_resolver = BasicQuotationMarkResolver( + QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + ) + + actual_resolved_quotation_marks = list( + basic_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 10, 11), + ] + ) + ) + expected_resolved_quotation_marks = [ + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"test text"').build(), 0, 1 + ), + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.Closing, TextSegment.Builder().set_text('"test text"').build(), 10, 11 + ), + ] + + assert_resolved_quotation_marks_equal( + actual_resolved_quotation_marks, + expected_resolved_quotation_marks, + ) + + +def test_is_opening_quote(): + english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + "standard_english" + ) + assert english_quote_convention is not None + + basic_quotation_mark_resolver = BasicQuotationMarkResolver( + QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + ) + + # valid opening quote at start of segment + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) + assert basic_quotation_mark_resolver._is_opening_quote(quote_match) is True + + # opening quote with leading whitespace + quote_match = 
QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 5, 6) + assert basic_quotation_mark_resolver._is_opening_quote(quote_match) is True + + # opening quote with quote introducer + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test:"text"').build(), 5, 6) + assert basic_quotation_mark_resolver._is_opening_quote(quote_match) is True + + # QuotationMarkStringMatch indices don't indicate a quotation mark + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 0, 1) + assert basic_quotation_mark_resolver._is_opening_quote(quote_match) is False + + # the quotation mark is not valid under the current quote convention + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('').build(), 10, 11) + assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + + # no trailing whitespace after quotation mark + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test"text').build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + + # opening quote at the start of the segment + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) + assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + + # opening quote with leading whitespace + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + + +def test_is_closing_quote_with_unambiguous_quote_convention(): + english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + "standard_english" + ) + assert english_quote_convention is not None + + basic_quotation_mark_resolver = BasicQuotationMarkResolver( + QuoteConventionDetectionResolutionSettings(QuoteConventionSet([english_quote_convention])) + ) + + # unambiguous 
closing quote at end of segment + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("“test text”").build(), 10, 11) + assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is True + + # unambiguous closing quote with trailing whitespace + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("“test” text").build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is True + + # unambiguous closing quote without the "correct" context + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("“test”text").build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is True + + # unambiguous opening quote + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("test “text”").build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + + +def test_resolve_opening_quote(): + english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + "standard_english" + ) + assert english_quote_convention is not None + + basic_quotation_mark_resolver = BasicQuotationMarkResolver( + QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + ) + + expected_resolved_quotation_mark = QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"test text"').build(), 0, 1 + ) + actual_resolved_quotation_mark = basic_quotation_mark_resolver._resolve_opening_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) + ) + assert actual_resolved_quotation_mark == expected_resolved_quotation_mark + assert basic_quotation_mark_resolver._last_quotation_mark == actual_resolved_quotation_mark + + +def test_resolve_closing_quote(): + english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + "standard_english" + ) + assert 
english_quote_convention is not None + + basic_quotation_mark_resolver = BasicQuotationMarkResolver( + QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + ) + + expected_resolved_quotation_mark = QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.Closing, TextSegment.Builder().set_text('"test text"').build(), 10, 11 + ) + actual_resolved_quotation_mark = basic_quotation_mark_resolver._resolve_closing_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 10, 11) + ) + assert actual_resolved_quotation_mark == expected_resolved_quotation_mark + + +def assert_resolved_quotation_marks_equal( + actual_resolved_quotation_marks: list[QuotationMarkMetadata], + expected_resolved_quotation_marks: list[QuotationMarkMetadata], +) -> None: + assert len(actual_resolved_quotation_marks) == len(expected_resolved_quotation_marks) + for actual_mark, expected_mark in zip(actual_resolved_quotation_marks, expected_resolved_quotation_marks): + assert actual_mark == expected_mark diff --git a/tests/corpora/test_quotation_denormalization_first_pass.py b/tests/corpora/test_quotation_denormalization_first_pass.py index 10896f2d..99fd83e3 100644 --- a/tests/corpora/test_quotation_denormalization_first_pass.py +++ b/tests/corpora/test_quotation_denormalization_first_pass.py @@ -1,9 +1,333 @@ -from typing import List +from typing import List, Union from machine.corpora import QuotationDenormalizationAction, QuotationDenormalizationFirstPass, parse_usfm -from machine.corpora.analysis import standard_quote_conventions +from machine.corpora.analysis import ( + Chapter, + QuotationMarkResolutionIssue, + QuoteConvention, + TextSegment, + Verse, + standard_quote_conventions, +) + + +def test_check_whether_basic_denormalization_will_work() -> None: + + first_pass_analyzer = QuotationDenormalizationFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + + # Cases where we expect basic denormalization to work + 
assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("standard_english"), + get_quote_convention_by_name("standard_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("standard_french"), + get_quote_convention_by_name("british_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("typewriter_western_european"), + get_quote_convention_by_name("standard_russian"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("typewriter_western_european_variant"), + get_quote_convention_by_name("standard_arabic"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("central_european"), + get_quote_convention_by_name("british_typewriter_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("standard_swedish"), + get_quote_convention_by_name("typewriter_french"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("standard_finnish"), + get_quote_convention_by_name("british_inspired_western_european"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("eastern_european"), + get_quote_convention_by_name("central_european"), + ) + is True + ) + + # Cases where we expect basic denormalization to fail + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("western_european"), + get_quote_convention_by_name("standard_english"), + ) + is False + ) + assert ( + 
first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("french_variant"), + get_quote_convention_by_name("hybrid_typewriter_english"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("british_inspired_western_european"), + get_quote_convention_by_name("standard_russian"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("typewriter_english"), + get_quote_convention_by_name("western_european"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("central_european_guillemets"), + get_quote_convention_by_name("french_variant"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("standard_arabic"), + get_quote_convention_by_name("hybrid_typewriter_english"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_basic_denormalization_will_work( + get_quote_convention_by_name("standard_russian"), + get_quote_convention_by_name("standard_french"), + ) + is False + ) + + +def test_choose_best_action_for_chapter() -> None: + # Verse text with no issues + actual_action = run_quotation_denormalization_first_pass_on_chapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. 
" + + 'He said to the woman, "Has God really said, ' + + "'You shall not eat of any tree of the garden'?\"" + ], + "standard_english", + "standard_english", + ) + expected_action = QuotationDenormalizationAction.APPLY_FULL + assert actual_action == expected_action + + # Verse text with unpaired opening quotation mark + actual_action = run_quotation_denormalization_first_pass_on_chapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + 'He said to the woman, "Has God really said, ' + + "'You shall not eat of any tree of the garden'?" + ], + "standard_english", + "standard_english", + ) + expected_action = QuotationDenormalizationAction.APPLY_BASIC + assert actual_action == expected_action + + # Verse text with unpaired closing quotation mark + actual_action = run_quotation_denormalization_first_pass_on_chapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + "He said to the woman, Has God really said, " + + 'You shall not eat of any tree of the garden?"' + ], + "standard_english", + "standard_english", + ) + expected_action = QuotationDenormalizationAction.APPLY_BASIC + assert actual_action == expected_action + + # Verse text with too deeply nested quotation marks + actual_action = run_quotation_denormalization_first_pass_on_chapter( + [ + '"Now the serpent was more "subtle than any animal ' + + 'of the "field which "Yahweh God had made. ' + + 'He said to the woman, "Has God really said, ' + + '"You shall not eat of any tree of the garden?' + ], + "standard_english", + "standard_english", + ) + expected_action = QuotationDenormalizationAction.SKIP + assert actual_action == expected_action + + # Verse text with an ambiguous quotation mark + actual_action = run_quotation_denormalization_first_pass_on_chapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. 
" + + 'He said to the woman"Has God really said, ' + + "You shall not eat of any tree of the garden?" + ], + "standard_english", + "standard_english", + ) + expected_action = QuotationDenormalizationAction.SKIP + assert actual_action == expected_action + + # Verse text with an ambiguous quotation mark + actual_action = run_quotation_denormalization_first_pass_on_chapter( + [ + "Now the serpent was more subtle than any animal " + + "of the field which Yahweh God had made. " + + 'He said to the woman"Has God really said, ' + + "You shall not eat of any tree of the garden?" + ], + "standard_english", + "standard_english", + ) + expected_action = QuotationDenormalizationAction.SKIP + assert actual_action == expected_action + + +def test_choose_best_action_based_on_observed_issues() -> None: + first_pass_analyzer = QuotationDenormalizationFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + first_pass_analyzer._will_basic_denormalization_work = False + + # Test with no issues + best_action = first_pass_analyzer._choose_best_action_based_on_observed_issues([]) + assert best_action == QuotationDenormalizationAction.APPLY_FULL + + # Test with one issue + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK] + ) + == QuotationDenormalizationAction.SKIP + ) + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK] + ) + == QuotationDenormalizationAction.SKIP + ) + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [QuotationMarkResolutionIssue.TOO_DEEP_NESTING] + ) + == QuotationDenormalizationAction.SKIP + ) + + # Test with multiple issues + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [ + QuotationMarkResolutionIssue.TOO_DEEP_NESTING, + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, + ] + ) + == QuotationDenormalizationAction.SKIP + ) + 
assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [ + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, + ] + ) + == QuotationDenormalizationAction.SKIP + ) + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [ + QuotationMarkResolutionIssue.TOO_DEEP_NESTING, + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, + ] + ) + == QuotationDenormalizationAction.SKIP + ) + + +def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> None: + first_pass_analyzer = QuotationDenormalizationFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + first_pass_analyzer._will_basic_denormalization_work = True + + # Test with no issues + best_action = first_pass_analyzer._choose_best_action_based_on_observed_issues([]) + assert best_action == QuotationDenormalizationAction.APPLY_FULL + + # Test with one issue + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK] + ) + == QuotationDenormalizationAction.APPLY_BASIC + ) + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK] + ) + == QuotationDenormalizationAction.SKIP + ) + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [QuotationMarkResolutionIssue.TOO_DEEP_NESTING] + ) + == QuotationDenormalizationAction.APPLY_BASIC + ) + + # Test with multiple issues + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [ + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, + ] + ) + == QuotationDenormalizationAction.SKIP + ) + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [ + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, + QuotationMarkResolutionIssue.TOO_DEEP_NESTING, + ] + ) + == 
QuotationDenormalizationAction.SKIP + ) + assert ( + first_pass_analyzer._choose_best_action_based_on_observed_issues( + [ + QuotationMarkResolutionIssue.TOO_DEEP_NESTING, + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, + ] + ) + == QuotationDenormalizationAction.APPLY_BASIC + ) +# tests of get_best_actions_by_chapter() def test_no_issues_in_usfm() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal @@ -203,3 +527,31 @@ def run_quotation_denormalization_first_pass( parse_usfm(normalized_usfm, first_pass_analyzer) return first_pass_analyzer.get_best_actions_by_chapter(normalized_usfm) + + +def run_quotation_denormalization_first_pass_on_chapter( + verse_texts: List[str], source_quote_convention_name: str, target_quote_convention_name: str +) -> QuotationDenormalizationAction: + source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + source_quote_convention_name + ) + assert source_quote_convention is not None + + target_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + target_quote_convention_name + ) + assert target_quote_convention is not None + + first_pass_analyzer = QuotationDenormalizationFirstPass(source_quote_convention, target_quote_convention) + + chapter = Chapter([Verse([TextSegment.Builder().set_text(verse_text).build() for verse_text in verse_texts])]) + + return first_pass_analyzer._find_best_action_for_chapter(chapter) + + +def get_quote_convention_by_name(name: str) -> QuoteConvention: + quote_convention: Union[QuoteConvention, None] = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(name) + ) + assert quote_convention is not None + return quote_convention From c17b79b46988181f02b7f6fa4191e99fed88791f Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 9 May 2025 12:02:39 -0400 Subject: [PATCH 12/31] Unit tests for UsfmStructureExtractor --- 
machine/corpora/analysis/__init__.py | 2 + .../quotation_mark_resolution_settings.py | 2 - ...onvention_detection_resolution_settings.py | 3 - .../corpora/basic_quotation_mark_resolver.py | 3 - ...ion_denormalization_resolution_settings.py | 3 - .../analysis/test_usfm_structure_extractor.py | 441 ++++++++++++++++++ 6 files changed, 443 insertions(+), 11 deletions(-) create mode 100644 tests/corpora/analysis/test_usfm_structure_extractor.py diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/analysis/__init__.py index 849222ff..cb270801 100644 --- a/machine/corpora/analysis/__init__.py +++ b/machine/corpora/analysis/__init__.py @@ -12,6 +12,7 @@ from .quote_convention_set import QuoteConventionSet from .text_segment import TextSegment from .usfm_marker_type import UsfmMarkerType +from .usfm_structure_extractor import UsfmStructureExtractor from .verse import Verse __all__ = [ @@ -30,5 +31,6 @@ "QuoteConventionSet", "TextSegment", "UsfmMarkerType", + "UsfmStructureExtractor", "Verse", ] diff --git a/machine/corpora/analysis/quotation_mark_resolution_settings.py b/machine/corpora/analysis/quotation_mark_resolution_settings.py index 59d4b6c4..e3385f1d 100644 --- a/machine/corpora/analysis/quotation_mark_resolution_settings.py +++ b/machine/corpora/analysis/quotation_mark_resolution_settings.py @@ -15,8 +15,6 @@ def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: def should_rely_on_paragraph_markers(self) -> bool: ... - def should_quit_on_error(self) -> bool: ... - def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: ... 
def does_metadata_match_quotation_mark( diff --git a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py b/machine/corpora/analysis/quote_convention_detection_resolution_settings.py index 43328a60..f38c6a5c 100644 --- a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py +++ b/machine/corpora/analysis/quote_convention_detection_resolution_settings.py @@ -23,9 +23,6 @@ def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: def should_rely_on_paragraph_markers(self): return True - def should_quit_on_error(self) -> bool: - return True - def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: return self._quote_convention_set.get_possible_depths(quotation_mark, direction) diff --git a/machine/corpora/basic_quotation_mark_resolver.py b/machine/corpora/basic_quotation_mark_resolver.py index b9f6f04f..5e945c6a 100644 --- a/machine/corpora/basic_quotation_mark_resolver.py +++ b/machine/corpora/basic_quotation_mark_resolver.py @@ -30,21 +30,18 @@ def _resolve_quotation_mark( quote_match: QuotationMarkStringMatch, ) -> Generator[QuotationMarkMetadata, None, None]: if self._is_opening_quote(quote_match): - print("Opening quote: %s" % quote_match.get_context()) quote: Union[QuotationMarkMetadata, None] = self._resolve_opening_mark(quote_match) if quote is not None: yield quote else: self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) elif self._is_closing_quote(quote_match): - print("Closing quote: %s" % quote_match.get_context()) quote: Union[QuotationMarkMetadata, None] = self._resolve_closing_mark(quote_match) if quote is not None: yield quote else: self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) else: - print("Unknown quote %s" % quote_match.get_context()) self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) def _is_opening_quote( diff --git 
a/machine/corpora/quotation_denormalization_resolution_settings.py b/machine/corpora/quotation_denormalization_resolution_settings.py index 363a505c..8f5dc0c5 100644 --- a/machine/corpora/quotation_denormalization_resolution_settings.py +++ b/machine/corpora/quotation_denormalization_resolution_settings.py @@ -25,9 +25,6 @@ def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: def should_rely_on_paragraph_markers(self): return False - def should_quit_on_error(self) -> bool: - return False - def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: return self._normalized_source_quote_convention.get_possible_depths(quotation_mark, direction) diff --git a/tests/corpora/analysis/test_usfm_structure_extractor.py b/tests/corpora/analysis/test_usfm_structure_extractor.py new file mode 100644 index 00000000..edaef383 --- /dev/null +++ b/tests/corpora/analysis/test_usfm_structure_extractor.py @@ -0,0 +1,441 @@ +from typing import List + +from machine.corpora import UsfmParser +from machine.corpora.analysis import Chapter, TextSegment, UsfmMarkerType, UsfmStructureExtractor, Verse + +verse_text_parser_state = usfm_parser = UsfmParser("").state +verse_text_parser_state.verse_ref.verse_num = 1 + + +def test_chapter_and_verse_markers(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert 
actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + + +def test_start_paragraph_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.start_para(verse_text_parser_state, "p", False, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + + +def test_start_character_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.start_char(verse_text_parser_state, "k", False, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert 
actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + + +def test_end_character_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_char(verse_text_parser_state, "k", None, False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + + +def test_end_note_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_note(verse_text_parser_state, "f", False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.EmbedMarker) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert 
actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + + +def test_end_table_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_note(verse_text_parser_state, "tr", False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.EmbedMarker) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + + +def test_ref_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_note(verse_text_parser_state, "x", False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.EmbedMarker) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None 
+ assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + + +def test_sidebar_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_note(verse_text_parser_state, "esb", False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.EmbedMarker) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + + +def test_multiple_verses(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.verse(verse_text_parser_state, "2", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build() + ] + ), + Verse( + [ + TextSegment.Builder() + .set_text("test2") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build() + ] + ), + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + 
assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[1].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[1].text_segments[0].get_next_segment() is None + + +def test_multiple_chapters(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.chapter(verse_text_parser_state, "2", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build() + ] + ), + ] + ), + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test2") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build() + ] + ), + ] + ), + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[1].verses[0].text_segments[0].get_previous_segment() is None + assert actual_chapters[1].verses[0].text_segments[0].get_next_segment() is None + + +def test_character_marker_in_text(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", 
"c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.start_char(verse_text_parser_state, "k", False, None) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build(), + TextSegment.Builder() + .set_text("test2") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .build(), + ] + ), + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert ( + actual_chapters[0].verses[0].text_segments[1].get_previous_segment() + == expected_chapters[0].verses[0].text_segments[0] + ) + assert ( + actual_chapters[0].verses[0].text_segments[0].get_next_segment() + == expected_chapters[0].verses[0].text_segments[1] + ) + + +def test_empty_text(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.start_char(verse_text_parser_state, "k", False, None) + usfm_structure_extractor.text(verse_text_parser_state, "") + usfm_structure_extractor.end_char(verse_text_parser_state, "k", None, False) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build(), + TextSegment.Builder() + 
.set_text("test2") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .build(), + ] + ), + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert ( + actual_chapters[0].verses[0].text_segments[1].get_previous_segment() + == expected_chapters[0].verses[0].text_segments[0] + ) + assert ( + actual_chapters[0].verses[0].text_segments[0].get_next_segment() + == expected_chapters[0].verses[0].text_segments[1] + ) + + +def test_reset(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor._reset() + + expected_chapters = [] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + + +def assert_chapter_equal(expected_chapters: List[Chapter], actual_chapters: List[Chapter]): + assert len(expected_chapters) == len(actual_chapters) + for expected_chapter, actual_chapter in zip(expected_chapters, actual_chapters): + assert len(expected_chapter.verses) == len(actual_chapter.verses) + for expected_verse, actual_verse in zip(expected_chapter.verses, actual_chapter.verses): + assert len(expected_verse.text_segments) == len(actual_verse.text_segments) + for expected_segment, actual_segment in zip(expected_verse.text_segments, actual_verse.text_segments): + assert expected_segment == actual_segment From 46ed639328724f87795d364c9fe92e7c7d4bdb7d Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 6 Jun 2025 13:47:52 -0400 Subject: [PATCH 13/31] Unit tests for several quotation mark analysis classes --- machine/corpora/__init__.py | 6 +- machine/corpora/analysis/__init__.py | 5 
+- .../depth_based_quotation_mark_resolver.py | 4 +- .../analysis/quotation_mark_string_match.py | 4 +- machine/corpora/analysis/quote_convention.py | 2 +- .../corpora/analysis/quote_convention_set.py | 20 +- machine/corpora/analysis/text_segment.py | 4 +- .../quotation_denormalization_first_pass.py | 3 +- .../quotation_denormalization_settings.py | 37 +- ...ormalization_usfm_update_block_handler.py} | 94 +- machine/corpora/scripture_update_block.py | 51 - .../corpora/scripture_update_block_handler.py | 9 - machine/corpora/scripture_update_element.py | 43 - tests/corpora/analysis/test_chapter.py | 23 + .../test_quotation_mark_string_match.py | 527 ++++++++ .../corpora/analysis/test_quote_convention.py | 367 +++++- .../analysis/test_quote_convention_set.py | 1145 +++++++++++++++++ tests/corpora/analysis/test_text_segment.py | 320 +++++ tests/corpora/analysis/test_verse.py | 42 + ...st_quotation_denormalization_first_pass.py | 2 +- ...ormalization_usfm_block_update_handler.py} | 367 ++++-- 21 files changed, 2781 insertions(+), 294 deletions(-) rename machine/corpora/{quotation_denormalization_scripture_update_block_handler.py => quotation_denormalization_usfm_update_block_handler.py} (64%) delete mode 100644 machine/corpora/scripture_update_block.py delete mode 100644 machine/corpora/scripture_update_block_handler.py delete mode 100644 machine/corpora/scripture_update_element.py create mode 100644 tests/corpora/analysis/test_chapter.py create mode 100644 tests/corpora/analysis/test_quotation_mark_string_match.py create mode 100644 tests/corpora/analysis/test_quote_convention_set.py create mode 100644 tests/corpora/analysis/test_text_segment.py create mode 100644 tests/corpora/analysis/test_verse.py rename tests/corpora/{test_quotation_denormalization_scripture_block_update_handler.py => test_quotation_denormalization_usfm_block_update_handler.py} (66%) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index f3c8804d..518901fa 100644 --- 
a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -28,10 +28,8 @@ from .quotation_denormalization_action import QuotationDenormalizationAction from .quotation_denormalization_first_pass import QuotationDenormalizationFirstPass from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings -from .quotation_denormalization_scripture_update_block_handler import ( - QuotationDenormalizationScriptureUpdateBlockHandler, -) from .quotation_denormalization_settings import QuotationDenormalizationSettings +from .quotation_denormalization_usfm_update_block_handler import QuotationDenormalizationUsfmUpdateBlockHandler from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -132,7 +130,7 @@ "parse_usfm", "QuotationDenormalizationAction", "QuotationDenormalizationFirstPass", - "QuotationDenormalizationScriptureUpdateBlockHandler", + "QuotationDenormalizationUsfmUpdateBlockHandler", "QuotationDenormalizationResolutionSettings", "QuotationDenormalizationSettings", "RtlReferenceOrder", diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/analysis/__init__.py index cb270801..90579e4a 100644 --- a/machine/corpora/analysis/__init__.py +++ b/machine/corpora/analysis/__init__.py @@ -1,12 +1,13 @@ from .chapter import Chapter from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_finder import QuotationMarkFinder from .quotation_mark_metadata import QuotationMarkMetadata from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings from .quotation_mark_resolver import QuotationMarkResolver from .quotation_mark_string_match import QuotationMarkStringMatch -from 
.quote_convention import QuoteConvention +from .quote_convention import QuoteConvention, SingleLevelQuoteConvention from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector from .quote_convention_set import QuoteConventionSet @@ -18,12 +19,14 @@ __all__ = [ "Chapter", "DepthBasedQuotationMarkResolver", + "SingleLevelQuoteConvention", "QuotationMarkDirection", "QuotationMarkMetadata", "QuotationMarkStringMatch", "QuoteConvention", "QuoteConventionAnalysis", "QuoteConventionDetectionResolutionSettings", + "QuotationMarkFinder", "QuotationMarkResolutionIssue", "QuotationMarkResolutionSettings", "QuotationMarkResolver", diff --git a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/analysis/depth_based_quotation_mark_resolver.py index 202cff00..e522faea 100644 --- a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py +++ b/machine/corpora/analysis/depth_based_quotation_mark_resolver.py @@ -227,9 +227,7 @@ def _is_closing_quote( # if the quote convention is ambiguous, use whitespace as a clue if self._settings.is_valid_opening_quotation_mark(match): return ( - match.has_trailing_whitespace() - or match.has_trailing_punctuation() - # or match.has_trailing_closing_quotation_mark(self._possible_quote_convention_set) + match.has_trailing_whitespace() or match.has_trailing_punctuation() or match.is_at_end_of_segment() ) and not match.has_leading_whitespace() return True diff --git a/machine/corpora/analysis/quotation_mark_string_match.py b/machine/corpora/analysis/quotation_mark_string_match.py index d8af13e9..0da48fc3 100644 --- a/machine/corpora/analysis/quotation_mark_string_match.py +++ b/machine/corpora/analysis/quotation_mark_string_match.py @@ -117,7 +117,7 @@ def has_trailing_whitespace(self) -> bool: return self.does_next_character_match(self.whitespace_pattern) def 
has_leading_punctuation(self) -> bool: - return self.does_next_character_match(self.punctuation_pattern) + return self.does_previous_character_match(self.punctuation_pattern) def has_trailing_punctuation(self) -> bool: return self.does_next_character_match(self.punctuation_pattern) @@ -138,7 +138,7 @@ def has_quote_introducer_in_leading_substring(self) -> bool: return self.does_leading_substring_match(self.quote_introducer_pattern) def has_leading_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: - return self.does_previous_character_match(quote_convention_set.get_opening_quotation_mark_regex()) + return self.does_previous_character_match(quote_convention_set.get_closing_quotation_mark_regex()) def has_trailing_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: return self.does_next_character_match(quote_convention_set.get_closing_quotation_mark_regex()) diff --git a/machine/corpora/analysis/quote_convention.py b/machine/corpora/analysis/quote_convention.py index 7d959df0..9a5ebe0b 100644 --- a/machine/corpora/analysis/quote_convention.py +++ b/machine/corpora/analysis/quote_convention.py @@ -61,7 +61,7 @@ def get_closing_quote_at_level(self, level: int) -> str: return self.levels[level - 1].get_closing_quote() def get_expected_quotation_mark(self, depth: int, direction: QuotationMarkDirection) -> str: - if depth > len(self.levels): + if depth > len(self.levels) or depth < 1: return "" return ( self.get_opening_quote_at_level(depth) diff --git a/machine/corpora/analysis/quote_convention_set.py b/machine/corpora/analysis/quote_convention_set.py index c1af9ef4..c4203774 100644 --- a/machine/corpora/analysis/quote_convention_set.py +++ b/machine/corpora/analysis/quote_convention_set.py @@ -31,12 +31,14 @@ def _create_quote_regexes(self) -> None: if len(all_quotation_marks) > 0: self.opening_quotation_mark_regex: Pattern = regex.compile( - r"[" + "".join(opening_quotation_marks) + "]" + r"[" + 
"".join(sorted(list(opening_quotation_marks))) + "]" ) self.closing_quotation_mark_regex: Pattern = regex.compile( - r"[" + "".join(closing_quotation_marks) + "]" + r"[" + "".join(sorted(list(closing_quotation_marks))) + "]" + ) + self.all_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(sorted(list(all_quotation_marks))) + "]" ) - self.all_quotation_mark_regex: Pattern = regex.compile(r"[" + "".join(all_quotation_marks) + "]") if len(opening_quotation_marks) == 0: self.opening_quotation_mark_regex = regex.compile(r"") @@ -65,20 +67,20 @@ def get_quote_convention_by_name(self, name: str) -> Union[QuoteConvention, None return convention return None + def get_all_quote_convention_names(self) -> List[str]: + return sorted([qc.name for qc in self.conventions]) + def get_possible_opening_marks(self) -> list[str]: - return list(self.closing_marks_by_opening_mark.keys()) + return sorted(list(self.closing_marks_by_opening_mark.keys())) def get_possible_closing_marks(self) -> list[str]: - return list(self.opening_marks_by_closing_mark.keys()) + return sorted(list(self.opening_marks_by_closing_mark.keys())) def is_valid_opening_quotation_mark(self, quotation_mark: str) -> bool: return quotation_mark in self.closing_marks_by_opening_mark def is_valid_closing_quotation_mark(self, quotation_mark: str) -> bool: - for closing_mark_set in self.closing_marks_by_opening_mark.values(): - if quotation_mark in closing_mark_set: - return True - return False + return quotation_mark in self.opening_marks_by_closing_mark def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: return (opening_mark in self.closing_marks_by_opening_mark) and ( diff --git a/machine/corpora/analysis/text_segment.py b/machine/corpora/analysis/text_segment.py index 5be2413c..8c7b7159 100644 --- a/machine/corpora/analysis/text_segment.py +++ b/machine/corpora/analysis/text_segment.py @@ -35,10 +35,10 @@ def length(self) -> int: return len(self.text) def substring_before(self, 
index: int) -> str: - return self.text[0:index] + return self.text[:index] def substring_after(self, index: int) -> str: - return self.text[index:-1] + return self.text[index:] def get_immediate_preceding_marker_type(self) -> UsfmMarkerType: return self.immediate_preceding_marker diff --git a/machine/corpora/quotation_denormalization_first_pass.py b/machine/corpora/quotation_denormalization_first_pass.py index e7d4d1f9..d1767715 100644 --- a/machine/corpora/quotation_denormalization_first_pass.py +++ b/machine/corpora/quotation_denormalization_first_pass.py @@ -46,7 +46,7 @@ def _check_whether_basic_denormalization_will_work( return False return True - def get_best_actions_by_chapter(self, usfm_text: str) -> List[QuotationDenormalizationAction]: + def get_best_actions_by_chapter(self) -> List[QuotationDenormalizationAction]: best_actions_by_chapter: List[QuotationDenormalizationAction] = [] for chapter in self.get_chapters(): @@ -67,7 +67,6 @@ def _find_best_action_for_chapter(self, chapter: Chapter) -> QuotationDenormaliz return self._choose_best_action_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) def _choose_best_action_based_on_observed_issues(self, issues) -> QuotationDenormalizationAction: - print(issues) if QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues: return QuotationDenormalizationAction.SKIP diff --git a/machine/corpora/quotation_denormalization_settings.py b/machine/corpora/quotation_denormalization_settings.py index cfb2bece..76c85f5d 100644 --- a/machine/corpora/quotation_denormalization_settings.py +++ b/machine/corpora/quotation_denormalization_settings.py @@ -3,38 +3,15 @@ class QuotationDenormalizationSettings: - def __init__(self): - self._should_run_on_existing_text = False - self._default_chapter_action = QuotationDenormalizationAction.APPLY_FULL - self._chapter_actions: list[QuotationDenormalizationAction] = [] - - def should_run_on_existing_text(self) -> bool: - return self._should_run_on_existing_text 
+ def __init__( + self, + default_chapter_action: QuotationDenormalizationAction = QuotationDenormalizationAction.APPLY_FULL, + chapter_actions: list[QuotationDenormalizationAction] = [], + ): + self._default_chapter_action = default_chapter_action + self._chapter_actions = chapter_actions def get_action_for_chapter(self, chapter_number: int) -> QuotationDenormalizationAction: if chapter_number <= len(self._chapter_actions): return self._chapter_actions[chapter_number - 1] return self._default_chapter_action - - class Builder: - def __init__(self): - self.settings = QuotationDenormalizationSettings() - - def run_on_existing_text(self) -> "QuotationDenormalizationSettings.Builder": - self.settings._should_run_on_existing_text = True - return self - - def set_chapter_actions( - self, chapter_actions: list[QuotationDenormalizationAction] - ) -> "QuotationDenormalizationSettings.Builder": - self.settings._chapter_actions = chapter_actions - return self - - def set_default_chapter_action( - self, action: QuotationDenormalizationAction - ) -> "QuotationDenormalizationSettings.Builder": - self.settings._default_chapter_action = action - return self - - def build(self): - return self.settings diff --git a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py similarity index 64% rename from machine/corpora/quotation_denormalization_scripture_update_block_handler.py rename to machine/corpora/quotation_denormalization_usfm_update_block_handler.py index b36d8b69..4d848c38 100644 --- a/machine/corpora/quotation_denormalization_scripture_update_block_handler.py +++ b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py @@ -12,13 +12,13 @@ from .quotation_denormalization_action import QuotationDenormalizationAction from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings from .quotation_denormalization_settings import 
QuotationDenormalizationSettings -from .scripture_update_block import ScriptureUpdateBlock -from .scripture_update_block_handler import ScriptureUpdateBlockHandler -from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType from .usfm_token import UsfmToken, UsfmTokenType +from .usfm_update_block import UsfmUpdateBlock +from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType +from .usfm_update_block_handler import UsfmUpdateBlockHandler -class QuotationDenormalizationScriptureUpdateBlockHandler(ScriptureUpdateBlockHandler): +class QuotationDenormalizationUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): def __init__( self, @@ -36,72 +36,50 @@ def __init__( ) self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder() + resolution_settings = QuotationDenormalizationResolutionSettings( + self._source_quote_convention, self._target_quote_convention + ) + # Each embed represents a separate context for quotation marks # (i.e. you can't open a quote in one context and close it in another) # so we need to keep track of the verse and embed contexts separately. 
- resolution_settings = QuotationDenormalizationResolutionSettings( - self._source_quote_convention, self._target_quote_convention + self._verse_text_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver( + resolution_settings ) - self._verse_text_quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + self._embed_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver( resolution_settings ) - self._embed_quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + self._simple_quotation_mark_resolver: BasicQuotationMarkResolver = BasicQuotationMarkResolver( resolution_settings ) - self._simple_quotation_mark_resolver: QuotationMarkResolver = BasicQuotationMarkResolver(resolution_settings) self._current_denormalization_action = QuotationDenormalizationAction.APPLY_FULL + self._current_chapter_number: int = 0 - def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: + self._check_for_chapter_change(block) if self._current_denormalization_action is QuotationDenormalizationAction.SKIP: return block if self._current_denormalization_action is QuotationDenormalizationAction.APPLY_BASIC: return self._apply_simple_denormalization(block) return self._apply_full_denormalization(block) - def _apply_simple_denormalization(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + def _apply_simple_denormalization(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: for element in block._elements: - if element.type == ScriptureUpdateElementType.EMBED_BLOCK or ( - element.type == ScriptureUpdateElementType.EXISTING_TEXT - and not self._settings.should_run_on_existing_text() - ): - continue - self._process_scripture_element(element, self._simple_quotation_mark_resolver) return block - def _apply_full_denormalization(self, block: ScriptureUpdateBlock) -> 
ScriptureUpdateBlock: - if len(block.elements) > 0 and block.elements[0].type == ScriptureUpdateElementType.EMBED: - return self._process_embed_block(block) - - return self._process_verse_text_block(block) - - def _process_embed_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - self._embed_quotation_mark_resolver.reset() + def _apply_full_denormalization(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: for element in block._elements: - if ( - element.type == ScriptureUpdateElementType.EXISTING_TEXT - and not self._settings.should_run_on_existing_text() - ): - continue + if element.type == UsfmUpdateBlockElementType.EMBED: + self._embed_quotation_mark_resolver.reset() + self._process_scripture_element(element, self._embed_quotation_mark_resolver) + else: + self._process_scripture_element(element, self._verse_text_quotation_mark_resolver) - self._process_scripture_element(element, self._embed_quotation_mark_resolver) - return block - - def _process_verse_text_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - for element in block._elements: - if element.type == ScriptureUpdateElementType.EMBED_BLOCK: - continue - if ( - element.type == ScriptureUpdateElementType.EXISTING_TEXT - and not self._settings.should_run_on_existing_text() - ): - continue - - self._process_scripture_element(element, self._verse_text_quotation_mark_resolver) return block def _process_scripture_element( - self, element: ScriptureUpdateElement, quotation_mark_resolver: QuotationMarkResolver + self, element: UsfmUpdateBlockElement, quotation_mark_resolver: QuotationMarkResolver ) -> None: text_segments: List[TextSegment] = self._create_text_segments(element) quotation_mark_matches: List[QuotationMarkStringMatch] = ( @@ -110,13 +88,10 @@ def _process_scripture_element( for resolved_quotation_mark in quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches): resolved_quotation_mark.update_quotation_mark(self._target_quote_convention) - def 
_create_text_segments(self, element: ScriptureUpdateElement) -> List[TextSegment]: + def _create_text_segments(self, element: UsfmUpdateBlockElement) -> List[TextSegment]: text_segments: List[TextSegment] = [] for token in element.get_tokens(): - if token.type == UsfmTokenType.CHAPTER: - self._start_new_chapter(token) - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) - elif token.type == UsfmTokenType.VERSE: + if token.type == UsfmTokenType.VERSE: self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) elif token.type == UsfmTokenType.PARAGRAPH: self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker) @@ -130,13 +105,6 @@ def _create_text_segments(self, element: ScriptureUpdateElement) -> List[TextSeg text_segments.append(text_segment) return self._set_previous_and_next_for_segments(text_segments) - def _start_new_chapter(self, token: UsfmToken) -> None: - chapter_number: Union[int, None] = int(token.data) if token.data is not None else None - if chapter_number is not None: - self._current_denormalization_action = self._settings.get_action_for_chapter(chapter_number) - self._verse_text_quotation_mark_resolver.reset() - self._next_scripture_text_segment_builder = TextSegment.Builder() - def _create_text_segment(self, token: UsfmToken) -> Union[TextSegment, None]: self._next_scripture_text_segment_builder.set_usfm_token(token) if token.text is not None: @@ -154,3 +122,15 @@ def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) if i < len(text_segments) - 1: text_segments[i].set_next_segment(text_segments[i + 1]) return text_segments + + def _check_for_chapter_change(self, block: UsfmUpdateBlock) -> None: + for scripture_ref in block.refs: + if scripture_ref.chapter_num != self._current_chapter_number: + self._current_chapter_number = scripture_ref.chapter_num + self._start_new_chapter(self._current_chapter_number) + + def 
_start_new_chapter(self, new_chapter_number: int) -> None: + self._current_denormalization_action = self._settings.get_action_for_chapter(new_chapter_number) + self._verse_text_quotation_mark_resolver.reset() + self._next_scripture_text_segment_builder = TextSegment.Builder() + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py deleted file mode 100644 index b4c7e290..00000000 --- a/machine/corpora/scripture_update_block.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from .scripture_ref import ScriptureRef -from .scripture_update_element import ( - ScriptureUpdateElement, - ScriptureUpdateElementType, - create_non_text_scripture_element, -) -from .usfm_token import UsfmToken, UsfmTokenType - - -class ScriptureUpdateBlock: - - def __init__(self) -> None: - self.ref: ScriptureRef = ScriptureRef() - self._elements: list[ScriptureUpdateElement] = [] - - @property - def elements(self) -> list[ScriptureUpdateElement]: - return self._elements - - def add_existing_text(self, token: UsfmToken, marked_for_removal: bool = False) -> None: - self._elements.append( - ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) - ) - - def add_inserted_text(self, tokens: list[UsfmToken]) -> None: - self._elements.append(ScriptureUpdateElement(ScriptureUpdateElementType.INSERTED_TEXT, tokens.copy())) - - def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None: - if token.type == UsfmTokenType.TEXT: - self._elements.append( - ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) - ) - else: - self._elements.append(create_non_text_scripture_element([token], marked_for_removal)) - - def add_tokens(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None: - if len(tokens) == 0: - return - 
self._elements.append(create_non_text_scripture_element(tokens, marked_for_removal)) - - def update_ref(self, ref: ScriptureRef) -> None: - self.ref = ref - - def clear(self) -> None: - self._elements.clear() - self.ref = ScriptureRef() - - def get_tokens(self) -> list[UsfmToken]: - return [token for element in self._elements for token in element.get_tokens()] diff --git a/machine/corpora/scripture_update_block_handler.py b/machine/corpora/scripture_update_block_handler.py deleted file mode 100644 index e5dc9cca..00000000 --- a/machine/corpora/scripture_update_block_handler.py +++ /dev/null @@ -1,9 +0,0 @@ -from __future__ import annotations -from abc import ABC - -from .scripture_update_block import ScriptureUpdateBlock - - -class ScriptureUpdateBlockHandler(ABC): - - def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: ... diff --git a/machine/corpora/scripture_update_element.py b/machine/corpora/scripture_update_element.py deleted file mode 100644 index 7296bd0a..00000000 --- a/machine/corpora/scripture_update_element.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from enum import Enum, auto - -from .scripture_embed import is_embed_style -from .usfm_token import UsfmToken, UsfmTokenType - - -class ScriptureUpdateElementType(Enum): - EXISTING_TEXT = auto() - INSERTED_TEXT = auto() - PARAGRAPH = auto() - EMBED = auto() - STYLE = auto() - OTHER = auto() - - -@dataclass -class ScriptureUpdateElement: - type: ScriptureUpdateElementType - tokens: list[UsfmToken] - marked_for_removal: bool = False - - def get_tokens(self) -> list[UsfmToken]: - if self.marked_for_removal: - return [] - return self.tokens - - -def create_non_text_scripture_element( - tokens: list[UsfmToken], marked_for_removal: bool = False -) -> ScriptureUpdateElement: - tokens = tokens.copy() - # Determine if it is a Paragraph, style, embed or other - if len(tokens) == 0 or tokens[0].marker is None: - return 
ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, [], marked_for_removal) - if tokens[0].type == UsfmTokenType.PARAGRAPH: - return ScriptureUpdateElement(ScriptureUpdateElementType.PARAGRAPH, tokens, marked_for_removal) - if is_embed_style(tokens[0].marker): - return ScriptureUpdateElement(ScriptureUpdateElementType.EMBED, tokens, marked_for_removal) - else: - return ScriptureUpdateElement(ScriptureUpdateElementType.STYLE, tokens, marked_for_removal) diff --git a/tests/corpora/analysis/test_chapter.py b/tests/corpora/analysis/test_chapter.py new file mode 100644 index 00000000..6f8557f0 --- /dev/null +++ b/tests/corpora/analysis/test_chapter.py @@ -0,0 +1,23 @@ +from machine.corpora.analysis import Chapter, TextSegment, Verse + + +def test_initialize_verse() -> None: + text_segments1 = [ + TextSegment.Builder().set_text("Segment 1").build(), + TextSegment.Builder().set_text("Segment 2").build(), + TextSegment.Builder().set_text("Segment 3").build(), + ] + verse1 = Verse(text_segments1) + + text_segments2 = [ + TextSegment.Builder().set_text("Segment 4").build(), + TextSegment.Builder().set_text("Segment 5").build(), + TextSegment.Builder().set_text("Segment 6").build(), + ] + verse2 = Verse(text_segments2) + + chapter = Chapter([verse1, verse2]) + + assert len(chapter.get_verses()) == 2 + assert chapter.get_verses()[0].get_text_segments() == text_segments1 + assert chapter.get_verses()[1].get_text_segments() == text_segments2 diff --git a/tests/corpora/analysis/test_quotation_mark_string_match.py b/tests/corpora/analysis/test_quotation_mark_string_match.py new file mode 100644 index 00000000..d3608c70 --- /dev/null +++ b/tests/corpora/analysis/test_quotation_mark_string_match.py @@ -0,0 +1,527 @@ +import regex + +from machine.corpora.analysis import ( + QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkStringMatch, + QuoteConvention, + QuoteConventionSet, + SingleLevelQuoteConvention, + TextSegment, + UsfmMarkerType, +) + + +def 
test_get_quotation_mark() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("quick brown fox").build(), 6, 7 + ) + assert quotation_mark_string_match.get_quotation_mark() == "b" + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("quick brown fox").build(), 6, 10 + ) + assert quotation_mark_string_match.get_quotation_mark() == "brow" + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("q").build(), 0, 1) + assert quotation_mark_string_match.get_quotation_mark() == "q" + + +def test_is_valid_opening_quotation_mark() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + assert quotation_mark_string_match.is_valid_opening_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + assert not quotation_mark_string_match.is_valid_opening_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 1, 2) + assert quotation_mark_string_match.is_valid_opening_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 0, 2) + assert not quotation_mark_string_match.is_valid_opening_quotation_mark(standard_english_quote_convention_set) + + +def 
test_is_valid_closing_quotation_mark() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + assert quotation_mark_string_match.is_valid_closing_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + assert not quotation_mark_string_match.is_valid_closing_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 0, 1) + assert quotation_mark_string_match.is_valid_closing_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 0, 2) + assert not quotation_mark_string_match.is_valid_closing_quotation_mark(standard_english_quote_convention_set) + + +def test_does_quotation_mark_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert quotation_mark_string_match.does_quotation_mark_match(regex.compile(r"^s$")) + assert not quotation_mark_string_match.does_quotation_mark_match(regex.compile(r"a")) + assert not quotation_mark_string_match.does_quotation_mark_match(regex.compile(r"sa")) + + +def test_does_next_character_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not 
quotation_mark_string_match.does_next_character_match(regex.compile(r"^s$")) + assert quotation_mark_string_match.does_next_character_match(regex.compile(r"a")) + assert not quotation_mark_string_match.does_next_character_match(regex.compile(r"sa")) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert not quotation_mark_string_match.does_next_character_match(regex.compile(r".*")) + + +def test_does_previous_character_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.does_previous_character_match(regex.compile(r"^s$")) + assert not quotation_mark_string_match.does_previous_character_match(regex.compile(r"a")) + assert not quotation_mark_string_match.does_previous_character_match(regex.compile(r"sa")) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.does_previous_character_match(regex.compile(r".*")) + + +def test_get_previous_character() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.get_previous_character() == "s" + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert quotation_mark_string_match.get_previous_character() == "x" + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert quotation_mark_string_match.get_previous_character() is None + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) + assert quotation_mark_string_match.get_previous_character() == "“" + + +def test_get_next_character() -> None: + 
quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.get_next_character() == "m" + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert quotation_mark_string_match.get_next_character() == "a" + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert quotation_mark_string_match.get_next_character() is None + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1) + assert quotation_mark_string_match.get_next_character() == "”" + + +def test_does_leading_substring_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 5, 6) + assert quotation_mark_string_match.does_leading_substring_match(regex.compile(r"^sampl$")) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.does_leading_substring_match(regex.compile(r".+")) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) + assert quotation_mark_string_match.does_leading_substring_match(regex.compile(r"\u201c")) + + +def test_does_trailing_substring_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 5, 6) + assert quotation_mark_string_match.does_trailing_substring_match(regex.compile(r"^ text$")) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 11, 12 + ) + assert not quotation_mark_string_match.does_trailing_substring_match(regex.compile(r".+")) + + quotation_mark_string_match = 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1) + assert quotation_mark_string_match.does_trailing_substring_match(regex.compile(r"\u201d")) + + +def test_get_context() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 15, 16 + ) + assert quotation_mark_string_match.get_context() == "is a bunch' of sample" + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 5, 6 + ) + assert quotation_mark_string_match.get_context() == "this is a bunch'" + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 25, 26 + ) + assert quotation_mark_string_match.get_context() == "' of sample text" + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("short").build(), 3, 4) + assert quotation_mark_string_match.get_context() == "short" + + +def test_resolve() -> None: + text_segment = TextSegment.Builder().set_text("'").build() + quotation_mark_string_match = QuotationMarkStringMatch(text_segment, 0, 1) + assert quotation_mark_string_match.resolve(2, QuotationMarkDirection.Opening) == QuotationMarkMetadata( + "'", 2, QuotationMarkDirection.Opening, text_segment, 0, 1 + ) + assert quotation_mark_string_match.resolve(1, QuotationMarkDirection.Opening) == QuotationMarkMetadata( + "'", 1, QuotationMarkDirection.Opening, text_segment, 0, 1 + ) + assert quotation_mark_string_match.resolve(1, QuotationMarkDirection.Closing) == QuotationMarkMetadata( + "'", 1, QuotationMarkDirection.Closing, text_segment, 0, 1 + ) + + +def test_is_at_start_of_segment() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert quotation_mark_string_match.is_at_start_of_segment() + + 
quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert not quotation_mark_string_match.is_at_start_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201csample text").build(), 0, 1 + ) + assert quotation_mark_string_match.is_at_start_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 15, 16 + ) + assert not quotation_mark_string_match.is_at_start_of_segment() + + +def test_is_at_end_of_segment() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert quotation_mark_string_match.is_at_end_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.is_at_end_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201csample text\u201d").build(), 12, 13 + ) + assert quotation_mark_string_match.is_at_end_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 15, 16 + ) + assert not quotation_mark_string_match.is_at_end_of_segment() + + +def test_has_leading_whitespace() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 7, 8) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample\ttext").build(), 7, 8) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.has_leading_whitespace() + + 
quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), 0, 1 + ) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.EmbedMarker).build(), 0, 1 + ) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.VerseMarker).build(), 0, 1 + ) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.ChapterMarker).build(), 0, 1 + ) + assert not quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.CharacterMarker).build(), 0, 1 + ) + assert not quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201csample text").add_preceding_marker(UsfmMarkerType.VerseMarker).build(), + 0, + 1, + ) + assert quotation_mark_string_match.has_leading_whitespace() + + +def test_has_trailing_whitespace() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 5, 6) + assert quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample\ttext").build(), 5, 6) + assert quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert 
not quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + 10, + 11, + ) + assert not quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.EmbedMarker).build(), 10, 11 + ) + assert not quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.VerseMarker).build(), 10, 11 + ) + assert not quotation_mark_string_match.has_trailing_whitespace() + + +def test_has_leading_punctuation() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample)\u201d text").build(), 7, 8 + ) + assert quotation_mark_string_match.has_leading_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample) \u201d text").build(), 8, 9 + ) + assert not quotation_mark_string_match.has_leading_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample,\u201d text").build(), 7, 8 + ) + assert quotation_mark_string_match.has_leading_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample.\u201d text").build(), 7, 8 + ) + assert quotation_mark_string_match.has_leading_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201csample text").build(), 0, 1 + ) + assert not quotation_mark_string_match.has_leading_punctuation() + + +def test_has_trailing_punctuation() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample 
\u201c-text").build(), 7, 8 + ) + assert quotation_mark_string_match.has_trailing_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample \u201c text").build(), 7, 8 + ) + assert not quotation_mark_string_match.has_trailing_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text\u201d").build(), 11, 12 + ) + assert not quotation_mark_string_match.has_trailing_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample', text\u201d").build(), 6, 7 + ) + assert quotation_mark_string_match.has_trailing_punctuation() + + +def test_has_letter_in_leading_substring() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.has_letter_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("ꮪample text").build(), 1, 2) + assert quotation_mark_string_match.has_letter_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.has_letter_in_leading_substring() + + +def test_has_letter_in_trailing_substring() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 9, 10) + assert quotation_mark_string_match.has_letter_in_trailing_substring() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample tex𑢼").build(), 9, 10) + assert quotation_mark_string_match.has_letter_in_trailing_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert not quotation_mark_string_match.has_letter_in_trailing_substring() + + +def 
test_has_leading_latin_letter() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.has_leading_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("5ample text").build(), 1, 2) + assert not quotation_mark_string_match.has_leading_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("Sample text").build(), 1, 2) + assert quotation_mark_string_match.has_leading_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.has_leading_latin_letter() + + +def test_has_trailing_latin_letter() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 9, 10) + assert quotation_mark_string_match.has_trailing_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample texT").build(), 9, 10 + ) + assert quotation_mark_string_match.has_trailing_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert not quotation_mark_string_match.has_trailing_latin_letter() + + +def test_has_quote_introducer_in_leading_substring() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample, \u201ctext").build(), 8, 9 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample,\u201ctext").build(), 7, 8 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + 
TextSegment.Builder().set_text("sample: \u201ctext").build(), 8, 9 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample:\u201ctext").build(), 7, 8 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample, \u201ctext").build(), 9, 10 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample,, \u201ctext").build(), 9, 10 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample, a \u201ctext").build(), 10, 11 + ) + assert not quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample, text").build(), 8, 9) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + +def test_has_leading_closing_quotation_mark() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + + normalized_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention.normalize()]) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample\u2019\u201d text").build(), 7, 8 + ) + assert 
quotation_mark_string_match.has_leading_closing_quotation_mark(standard_english_quote_convention_set) + assert not quotation_mark_string_match.has_leading_closing_quotation_mark(normalized_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text('sample\u2019" text').build(), 7, 8 + ) + assert quotation_mark_string_match.has_leading_closing_quotation_mark(standard_english_quote_convention_set) + assert not quotation_mark_string_match.has_leading_closing_quotation_mark(normalized_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text('sample"\u201d text').build(), 7, 8 + ) + assert not quotation_mark_string_match.has_leading_closing_quotation_mark(standard_english_quote_convention_set) + assert quotation_mark_string_match.has_leading_closing_quotation_mark(normalized_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text('sample" \u201d text').build(), 8, 9 + ) + assert not quotation_mark_string_match.has_leading_closing_quotation_mark(standard_english_quote_convention_set) + assert not quotation_mark_string_match.has_leading_closing_quotation_mark(normalized_english_quote_convention_set) + + +def test_has_trailing_closing_quotation_mark() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + + normalized_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention.normalize()]) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample\u2019\u201d text").build(), 
6, 7 + ) + assert quotation_mark_string_match.has_trailing_closing_quotation_mark(standard_english_quote_convention_set) + assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(normalized_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text('sample\u2019" text').build(), 6, 7 + ) + assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(standard_english_quote_convention_set) + assert quotation_mark_string_match.has_trailing_closing_quotation_mark(normalized_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text('sample"\u201d text').build(), 6, 7 + ) + assert quotation_mark_string_match.has_trailing_closing_quotation_mark(standard_english_quote_convention_set) + assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(normalized_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text('sample" \u201d text').build(), 6, 7 + ) + assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(standard_english_quote_convention_set) + assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(normalized_english_quote_convention_set) diff --git a/tests/corpora/analysis/test_quote_convention.py b/tests/corpora/analysis/test_quote_convention.py index 3e4e7a7f..165b1b4a 100644 --- a/tests/corpora/analysis/test_quote_convention.py +++ b/tests/corpora/analysis/test_quote_convention.py @@ -1,7 +1,371 @@ +from machine.corpora.analysis import QuotationMarkDirection from machine.corpora.analysis.quote_convention import QuoteConvention, SingleLevelQuoteConvention -def test_print_summary(): +def test_single_level_quote_convention_normalize() -> None: + english_level1_quote_convention = SingleLevelQuoteConvention("\u201c", "\u201d") + normalized_english_level1_quote_convention = 
english_level1_quote_convention.normalize() + assert normalized_english_level1_quote_convention.get_opening_quote() == '"' + assert normalized_english_level1_quote_convention.get_closing_quote() == '"' + + english_level2_quote_convention = SingleLevelQuoteConvention("\u2018", "\u2019") + normalized_english_level2_quote_convention = english_level2_quote_convention.normalize() + assert normalized_english_level2_quote_convention.get_opening_quote() == "'" + assert normalized_english_level2_quote_convention.get_closing_quote() == "'" + + already_normalized_english_level1_quote_convention = SingleLevelQuoteConvention('"', '"') + doubly_normalized_english_level1_quote_convention = already_normalized_english_level1_quote_convention.normalize() + assert doubly_normalized_english_level1_quote_convention.get_opening_quote() == '"' + assert doubly_normalized_english_level1_quote_convention.get_closing_quote() == '"' + + already_normalized_english_level2_quote_convention = SingleLevelQuoteConvention("'", "'") + doubly_normalized_english_level2_quote_convention = already_normalized_english_level2_quote_convention.normalize() + assert doubly_normalized_english_level2_quote_convention.get_opening_quote() == "'" + assert doubly_normalized_english_level2_quote_convention.get_closing_quote() == "'" + + french_level1_quote_convention = SingleLevelQuoteConvention("\u00ab", "\u00bb") + normalized_french_level1_quote_convention = french_level1_quote_convention.normalize() + assert normalized_french_level1_quote_convention.get_opening_quote() == '"' + assert normalized_french_level1_quote_convention.get_closing_quote() == '"' + + french_level2_quote_convention = SingleLevelQuoteConvention("\u2039", "\u203a") + normalized_french_level2_quote_convention = french_level2_quote_convention.normalize() + assert normalized_french_level2_quote_convention.get_opening_quote() == "\u2039" + assert normalized_french_level2_quote_convention.get_closing_quote() == "\u203a" + + 
typewriter_french_level1_quote_convention = SingleLevelQuoteConvention("<<", ">>") + normalized_typewriter_french_level1_quote_convention = typewriter_french_level1_quote_convention.normalize() + assert normalized_typewriter_french_level1_quote_convention.get_opening_quote() == "<<" + assert normalized_typewriter_french_level1_quote_convention.get_closing_quote() == ">>" + + typewriter_french_level2_quote_convention = SingleLevelQuoteConvention("<", ">") + normalized_typewriter_french_level2_quote_convention = typewriter_french_level2_quote_convention.normalize() + assert normalized_typewriter_french_level2_quote_convention.get_opening_quote() == "<" + assert normalized_typewriter_french_level2_quote_convention.get_closing_quote() == ">" + + central_european_level1_quote_convention = SingleLevelQuoteConvention("\u201e", "\u201c") + normalized_central_european_level1_quote_convention = central_european_level1_quote_convention.normalize() + assert normalized_central_european_level1_quote_convention.get_opening_quote() == '"' + assert normalized_central_european_level1_quote_convention.get_closing_quote() == '"' + + central_european_level2_quote_convention = SingleLevelQuoteConvention("\u201a", "\u2018") + normalized_central_european_level2_quote_convention = central_european_level2_quote_convention.normalize() + assert normalized_central_european_level2_quote_convention.get_opening_quote() == "'" + assert normalized_central_european_level2_quote_convention.get_closing_quote() == "'" + + central_european_guillemets_quote_convention = SingleLevelQuoteConvention("\u00bb", "\u00ab") + normalized_central_european_guillemets_quote_convention = central_european_guillemets_quote_convention.normalize() + assert normalized_central_european_guillemets_quote_convention.get_opening_quote() == '"' + assert normalized_central_european_guillemets_quote_convention.get_closing_quote() == '"' + + swedish_level1_quote_convention = SingleLevelQuoteConvention("\u201d", "\u201d") + 
normalized_swedish_level1_quote_convention = swedish_level1_quote_convention.normalize() + assert normalized_swedish_level1_quote_convention.get_opening_quote() == '"' + assert normalized_swedish_level1_quote_convention.get_closing_quote() == '"' + + swedish_level2_quote_convention = SingleLevelQuoteConvention("\u2019", "\u2019") + normalized_swedish_level2_quote_convention = swedish_level2_quote_convention.normalize() + assert normalized_swedish_level2_quote_convention.get_opening_quote() == "'" + assert normalized_swedish_level2_quote_convention.get_closing_quote() == "'" + + finnish_level1_quote_convention = SingleLevelQuoteConvention("\u00bb", "\u00bb") + normalized_finnish_level1_quote_convention = finnish_level1_quote_convention.normalize() + assert normalized_finnish_level1_quote_convention.get_opening_quote() == '"' + assert normalized_finnish_level1_quote_convention.get_closing_quote() == '"' + + arabic_level1_quote_convention = SingleLevelQuoteConvention("\u201d", "\u201c") + normalized_arabic_level1_quote_convention = arabic_level1_quote_convention.normalize() + assert normalized_arabic_level1_quote_convention.get_opening_quote() == '"' + assert normalized_arabic_level1_quote_convention.get_closing_quote() == '"' + + arabic_level2_quote_convention = SingleLevelQuoteConvention("\u2019", "\u2018") + normalized_arabic_level2_quote_convention = arabic_level2_quote_convention.normalize() + assert normalized_arabic_level2_quote_convention.get_opening_quote() == "'" + assert normalized_arabic_level2_quote_convention.get_closing_quote() == "'" + + +def test_get_num_levels() -> None: + empty_quote_convention = QuoteConvention("empty-quote-convention", []) + assert empty_quote_convention.get_num_levels() == 0 + + one_level_quote_convention = QuoteConvention( + "one-level-quote-convention", + [SingleLevelQuoteConvention("\u201c", "\u201d")], + ) + assert one_level_quote_convention.get_num_levels() == 1 + + two_level_quote_convention = QuoteConvention( + 
"two-level-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + assert two_level_quote_convention.get_num_levels() == 2 + + three_level_quote_convention = QuoteConvention( + "three-level-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201D", "\u201D"), + ], + ) + assert three_level_quote_convention.get_num_levels() == 3 + + +def test_get_opening_quote_at_level() -> None: + quote_convention = QuoteConvention( + "test-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert quote_convention.get_opening_quote_at_level(1) == "\u201c" + assert quote_convention.get_opening_quote_at_level(2) == "\u2018" + assert quote_convention.get_opening_quote_at_level(3) == "\u00ab" + + +def test_get_closing_quote_at_level() -> None: + quote_convention = QuoteConvention( + "test-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert quote_convention.get_closing_quote_at_level(1) == "\u201d" + assert quote_convention.get_closing_quote_at_level(2) == "\u2019" + assert quote_convention.get_closing_quote_at_level(3) == "\u00bb" + + +def test_get_expected_quotation_mark() -> None: + quote_convention = QuoteConvention( + "test-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert quote_convention.get_expected_quotation_mark(1, QuotationMarkDirection.Opening) == "\u201c" + assert quote_convention.get_expected_quotation_mark(1, QuotationMarkDirection.Closing) == "\u201d" + assert 
quote_convention.get_expected_quotation_mark(2, QuotationMarkDirection.Opening) == "\u2018" + assert quote_convention.get_expected_quotation_mark(2, QuotationMarkDirection.Closing) == "\u2019" + assert quote_convention.get_expected_quotation_mark(3, QuotationMarkDirection.Opening) == "\u00ab" + assert quote_convention.get_expected_quotation_mark(3, QuotationMarkDirection.Closing) == "\u00bb" + assert quote_convention.get_expected_quotation_mark(4, QuotationMarkDirection.Opening) == "" + assert quote_convention.get_expected_quotation_mark(4, QuotationMarkDirection.Closing) == "" + assert quote_convention.get_expected_quotation_mark(0, QuotationMarkDirection.Opening) == "" + assert quote_convention.get_expected_quotation_mark(0, QuotationMarkDirection.Closing) == "" + + +def test_includes_opening_quotation_mark() -> None: + empty_quote_convention = QuoteConvention("empty quote convention", []) + assert not empty_quote_convention._includes_opening_quotation_mark("\u201c") + + positive_quote_convention1 = QuoteConvention( + "positive quote convention 1", [SingleLevelQuoteConvention("\u201c", "\u201d")] + ) + assert positive_quote_convention1._includes_opening_quotation_mark("\u201c") + + negative_quote_convention1 = QuoteConvention( + "negative quote convention 1", [SingleLevelQuoteConvention("\u2018", "\u2019")] + ) + assert not negative_quote_convention1._includes_opening_quotation_mark("\u201c") + + negative_quote_convention2 = QuoteConvention( + "negative quote convention 2", [SingleLevelQuoteConvention("\u201d", "\u201c")] + ) + assert not negative_quote_convention2._includes_opening_quotation_mark("\u201c") + + positive_quote_convention2 = QuoteConvention( + "positive quote convention 2", + [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")], + ) + assert positive_quote_convention2._includes_opening_quotation_mark("\u201c") + + positive_quote_convention3 = QuoteConvention( + "positive quote convention 3", + 
[SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u201c", "\u201d")], + ) + assert positive_quote_convention3._includes_opening_quotation_mark("\u201c") + + negative_quote_convention3 = QuoteConvention( + "negative quote convention 3", + [ + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert not negative_quote_convention3._includes_opening_quotation_mark("\u201c") + + +def test_includes_closing_quotation_mark() -> None: + empty_quote_convention = QuoteConvention("empty quote convention", []) + assert not empty_quote_convention._includes_closing_quotation_mark("\u201d") + + positive_quote_convention1 = QuoteConvention( + "positive quote convention 1", [SingleLevelQuoteConvention("\u201c", "\u201d")] + ) + assert positive_quote_convention1._includes_closing_quotation_mark("\u201d") + + negative_quote_convention1 = QuoteConvention( + "negative quote convention 1", [SingleLevelQuoteConvention("\u2018", "\u2019")] + ) + assert not negative_quote_convention1._includes_closing_quotation_mark("\u201d") + + negative_quote_convention2 = QuoteConvention( + "negative quote convention 2", [SingleLevelQuoteConvention("\u201d", "\u201c")] + ) + assert not negative_quote_convention2._includes_closing_quotation_mark("\u201d") + + positive_quote_convention2 = QuoteConvention( + "positive quote convention 2", + [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")], + ) + assert positive_quote_convention2._includes_closing_quotation_mark("\u201d") + + positive_quote_convention3 = QuoteConvention( + "positive quote convention 3", + [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u201c", "\u201d")], + ) + assert positive_quote_convention3._includes_closing_quotation_mark("\u201d") + + negative_quote_convention3 = QuoteConvention( + "negative quote convention 3", + [ + 
SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert not negative_quote_convention3._includes_closing_quotation_mark("\u201d") + + +def test_get_possible_depths() -> None: + quote_convention = QuoteConvention( + "test-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + assert quote_convention.get_possible_depths("\u201c", QuotationMarkDirection.Opening) == {1, 3} + assert quote_convention.get_possible_depths("\u201c", QuotationMarkDirection.Closing) == set() + assert quote_convention.get_possible_depths("\u2018", QuotationMarkDirection.Opening) == {2, 4} + assert quote_convention.get_possible_depths("\u2018", QuotationMarkDirection.Closing) == set() + assert quote_convention.get_possible_depths("\u201d", QuotationMarkDirection.Opening) == set() + assert quote_convention.get_possible_depths("\u201d", QuotationMarkDirection.Closing) == {1, 3} + assert quote_convention.get_possible_depths("\u2019", QuotationMarkDirection.Opening) == set() + assert quote_convention.get_possible_depths("\u2019", QuotationMarkDirection.Closing) == {2, 4} + assert quote_convention.get_possible_depths("\u00ab", QuotationMarkDirection.Opening) == set() + assert quote_convention.get_possible_depths("\u00ab", QuotationMarkDirection.Closing) == set() + + +def test_is_compatible_with_observed_quotation_marks() -> None: + quote_convention = QuoteConvention( + "test-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u2018"], ["\u201d", "\u2019"]) + assert 
quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u00ab"], ["\u201d", "\u00bb"]) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c"], ["\u201d", "\u2019"]) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c"], ["\u201d"]) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u00ab"], ["\u201d", "\u2019"]) + + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u201d", "\u2019"], ["\u201c"]) + + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u201e"], ["\u201d"]) + + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u2018"], ["\u201d", "\u201f"]) + + # must have observed the first-level quotes + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u2018"], ["\u201d"]) + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u2018"], ["\u00ab"]) + + +def test_normalize() -> None: + empty_quote_convention = QuoteConvention("empty-quote-convention", []) + normalized_empty_quote_convention = empty_quote_convention.normalize() + assert normalized_empty_quote_convention.get_name() == "empty-quote-convention_normalized" + assert normalized_empty_quote_convention.get_num_levels() == 0 + + standard_english_quote_convention = QuoteConvention( + "standard-english-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + normalized_standard_english_quote_convention = standard_english_quote_convention.normalize() + assert normalized_standard_english_quote_convention.get_name() == "standard-english-quote-convention_normalized" + assert normalized_standard_english_quote_convention.get_num_levels() == 4 + assert 
normalized_standard_english_quote_convention.get_opening_quote_at_level(1) == '"' + assert normalized_standard_english_quote_convention.get_closing_quote_at_level(1) == '"' + assert normalized_standard_english_quote_convention.get_opening_quote_at_level(2) == "'" + assert normalized_standard_english_quote_convention.get_closing_quote_at_level(2) == "'" + assert normalized_standard_english_quote_convention.get_opening_quote_at_level(3) == '"' + assert normalized_standard_english_quote_convention.get_closing_quote_at_level(3) == '"' + assert normalized_standard_english_quote_convention.get_opening_quote_at_level(4) == "'" + assert normalized_standard_english_quote_convention.get_closing_quote_at_level(4) == "'" + + western_european_quote_convention = QuoteConvention( + "test-quote-convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + normalized_western_european_quote_convention = western_european_quote_convention.normalize() + assert normalized_western_european_quote_convention.get_name() == "test-quote-convention_normalized" + assert normalized_western_european_quote_convention.get_num_levels() == 3 + assert normalized_western_european_quote_convention.get_opening_quote_at_level(1) == '"' + assert normalized_western_european_quote_convention.get_closing_quote_at_level(1) == '"' + assert normalized_western_european_quote_convention.get_opening_quote_at_level(2) == '"' + assert normalized_western_european_quote_convention.get_closing_quote_at_level(2) == '"' + assert normalized_western_european_quote_convention.get_opening_quote_at_level(3) == "'" + assert normalized_western_european_quote_convention.get_closing_quote_at_level(3) == "'" + + hybrid_british_typewriter_english_quote_convention = QuoteConvention( + "hybrid-british-typewriter-english-quote-convention", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + 
SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + ], + ) + + normalized_hybrid_british_typewriter_english_quote_convention = ( + hybrid_british_typewriter_english_quote_convention.normalize() + ) + assert ( + normalized_hybrid_british_typewriter_english_quote_convention.get_name() + == "hybrid-british-typewriter-english-quote-convention_normalized" + ) + assert normalized_hybrid_british_typewriter_english_quote_convention.get_num_levels() == 3 + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quote_at_level(1) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quote_at_level(1) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quote_at_level(2) == "'" + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quote_at_level(2) == "'" + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quote_at_level(3) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quote_at_level(3) == '"' + + +def test_print_summary() -> None: quote_convention = QuoteConvention( "test-quote-convention", [ @@ -17,4 +381,3 @@ def test_print_summary(): + "\u201DThird-level quote\u201D\n" ) assert quote_convention._get_summary_message() == expected_summary_message - assert True diff --git a/tests/corpora/analysis/test_quote_convention_set.py b/tests/corpora/analysis/test_quote_convention_set.py new file mode 100644 index 00000000..01c79d34 --- /dev/null +++ b/tests/corpora/analysis/test_quote_convention_set.py @@ -0,0 +1,1145 @@ +from machine.corpora.analysis import ( + QuotationMarkDirection, + QuoteConvention, + QuoteConventionSet, + SingleLevelQuoteConvention, +) + + +def test_quote_regexes() -> None: + empty_quote_convention_set = QuoteConventionSet([]) + assert empty_quote_convention_set.opening_quotation_mark_regex.pattern == r"" + assert 
empty_quote_convention_set.closing_quotation_mark_regex.pattern == r"" + assert empty_quote_convention_set.all_quotation_mark_regex.pattern == r"" + + quote_convention_set_with_empty_conventions = QuoteConventionSet( + [QuoteConvention("empty convention 1", []), QuoteConvention("empty convention 2", [])] + ) + assert quote_convention_set_with_empty_conventions.opening_quotation_mark_regex.pattern == r"" + assert quote_convention_set_with_empty_conventions.closing_quotation_mark_regex.pattern == r"" + assert quote_convention_set_with_empty_conventions.all_quotation_mark_regex.pattern == r"" + + standard_english_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + ] + ) + assert standard_english_quote_convention_set.opening_quotation_mark_regex.pattern == r"[‘“]" + assert standard_english_quote_convention_set.closing_quotation_mark_regex.pattern == r"[’”]" + assert standard_english_quote_convention_set.all_quotation_mark_regex.pattern == r"[‘’“”]" + + western_european_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + ] + ) + assert western_european_quote_convention_set.opening_quotation_mark_regex.pattern == r"[«‘“]" + assert western_european_quote_convention_set.closing_quotation_mark_regex.pattern == r"[»’”]" + assert western_european_quote_convention_set.all_quotation_mark_regex.pattern == r"[«»‘’“”]" + + multiple_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + 
SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "typewriter_french", + [ + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention("<", ">"), + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention("<", ">"), + ], + ), + QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ), + ] + ) + assert multiple_quote_convention_set.opening_quotation_mark_regex.pattern == r"[<<<«‘“‹]" + assert multiple_quote_convention_set.closing_quotation_mark_regex.pattern == r"[>>>»’”›]" + assert multiple_quote_convention_set.all_quotation_mark_regex.pattern == r"[<<<>>>«»‘’“”‹›]" + + +def test_quotation_mark_pair_map() -> None: + empty_quote_convention_set = QuoteConventionSet([]) + assert empty_quote_convention_set.closing_marks_by_opening_mark == {} + assert empty_quote_convention_set.opening_marks_by_closing_mark == {} + + quote_convention_set_with_empty_conventions = QuoteConventionSet( + [QuoteConvention("empty convention 1", []), QuoteConvention("empty convention 2", [])] + ) + assert quote_convention_set_with_empty_conventions.closing_marks_by_opening_mark == {} + assert quote_convention_set_with_empty_conventions.opening_marks_by_closing_mark == {} + + standard_english_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + ] + ) + assert standard_english_quote_convention_set.closing_marks_by_opening_mark == {"‘": {"’"}, "“": {"”"}} + assert standard_english_quote_convention_set.opening_marks_by_closing_mark == {"’": {"‘"}, "”": {"“"}} + 
+ western_european_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + ] + ) + assert western_european_quote_convention_set.closing_marks_by_opening_mark == {"‘": {"’"}, "“": {"”"}, "«": {"»"}} + assert western_european_quote_convention_set.opening_marks_by_closing_mark == {"’": {"‘"}, "”": {"“"}, "»": {"«"}} + + multiple_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ), + QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ), + ] + ) + assert multiple_quote_convention_set.closing_marks_by_opening_mark == { + "‘": {"’"}, + "“": {"”"}, + "„": {"“"}, + "‚": {"‘"}, + "”": {"”"}, + "’": {"’"}, + } + assert multiple_quote_convention_set.opening_marks_by_closing_mark == { + "’": {"‘", "’"}, + "”": {"“", "”"}, + "“": {"„"}, + "‘": {"‚"}, + } + + +def test_get_quote_convention_by_name() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", 
"\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + multiple_quote_convention_set = QuoteConventionSet( + [standard_english_quote_convention, central_european_quote_convention, standard_swedish_quote_convention] + ) + + assert ( + multiple_quote_convention_set.get_quote_convention_by_name("standard_english") + == standard_english_quote_convention + ) + assert ( + multiple_quote_convention_set.get_quote_convention_by_name("central_european") + == central_european_quote_convention + ) + assert ( + multiple_quote_convention_set.get_quote_convention_by_name("standard_swedish") + == standard_swedish_quote_convention + ) + assert multiple_quote_convention_set.get_quote_convention_by_name("undefined convention") is None + + +def test_get_all_quote_convention_names() -> None: + assert QuoteConventionSet([]).get_all_quote_convention_names() == [] + assert QuoteConventionSet([QuoteConvention("conv", [])]).get_all_quote_convention_names() == ["conv"] + assert QuoteConventionSet( + [QuoteConvention("conv1", []), QuoteConvention("conv2", [])] + ).get_all_quote_convention_names() == ["conv1", "conv2"] + assert QuoteConventionSet( + [QuoteConvention("conv2", []), QuoteConvention("conv1", [])] + ).get_all_quote_convention_names() == ["conv1", "conv2"] + + +def test_get_possible_opening_marks() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + 
SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.get_possible_opening_marks() == ["‘", "“"] + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.get_possible_opening_marks() == ["‚", "„"] + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.get_possible_opening_marks() == ["’", "”"] + + multiple_quote_convention_set = QuoteConventionSet( + [standard_english_quote_convention, central_european_quote_convention, standard_swedish_quote_convention] + ) + assert multiple_quote_convention_set.get_possible_opening_marks() == ["‘", "’", "‚", "“", "”", "„"] + + +def test_get_possible_closing_marks() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + 
SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.get_possible_closing_marks() == ["’", "”"] + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.get_possible_closing_marks() == ["‘", "“"] + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.get_possible_closing_marks() == ["’", "”"] + + multiple_quote_convention_set = QuoteConventionSet( + [standard_english_quote_convention, central_european_quote_convention, standard_swedish_quote_convention] + ) + assert multiple_quote_convention_set.get_possible_closing_marks() == ["‘", "’", "“", "”"] + + +def test_is_opening_quotation_mark() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + 
SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.is_valid_opening_quotation_mark("‘") + assert standard_english_quote_convention_set.is_valid_opening_quotation_mark("“") + assert not standard_english_quote_convention_set.is_valid_opening_quotation_mark("”") + assert not standard_english_quote_convention_set.is_valid_opening_quotation_mark("’") + assert not standard_english_quote_convention_set.is_valid_opening_quotation_mark("") + assert not standard_english_quote_convention_set.is_valid_opening_quotation_mark("‘“") + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.is_valid_opening_quotation_mark("‚") + assert central_european_quote_convention_set.is_valid_opening_quotation_mark("„") + assert not central_european_quote_convention_set.is_valid_opening_quotation_mark("‘") + assert not central_european_quote_convention_set.is_valid_opening_quotation_mark("“") + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert 
standard_swedish_quote_convention_set.is_valid_opening_quotation_mark("’") + assert standard_swedish_quote_convention_set.is_valid_opening_quotation_mark("”") + + standard_french_quote_convention_set = QuoteConventionSet([standard_french_quote_convention]) + assert standard_french_quote_convention_set.is_valid_opening_quotation_mark("«") + assert standard_french_quote_convention_set.is_valid_opening_quotation_mark("‹") + assert not standard_french_quote_convention_set.is_valid_opening_quotation_mark("»") + assert not standard_french_quote_convention_set.is_valid_opening_quotation_mark("›") + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + standard_french_quote_convention, + ] + ) + assert multiple_quote_convention_set.get_possible_opening_marks() == ["«", "‘", "’", "‚", "“", "”", "„", "‹"] + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("‘") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("’") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("‚") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("“") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("”") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("„") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("«") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("‹") + assert not multiple_quote_convention_set.is_valid_opening_quotation_mark("»") + assert not multiple_quote_convention_set.is_valid_opening_quotation_mark("›") + + +def test_is_closing_quotation_mark() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + 
SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.is_valid_closing_quotation_mark("”") + assert standard_english_quote_convention_set.is_valid_closing_quotation_mark("’") + assert not standard_english_quote_convention_set.is_valid_closing_quotation_mark("‘") + assert not standard_english_quote_convention_set.is_valid_closing_quotation_mark("“") + assert not standard_english_quote_convention_set.is_valid_closing_quotation_mark("") + assert not standard_english_quote_convention_set.is_valid_closing_quotation_mark("”’") + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.is_valid_closing_quotation_mark("‘") + assert central_european_quote_convention_set.is_valid_closing_quotation_mark("“") + assert not central_european_quote_convention_set.is_valid_closing_quotation_mark("„") + assert not 
central_european_quote_convention_set.is_valid_closing_quotation_mark("‚") + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.is_valid_closing_quotation_mark("’") + assert standard_swedish_quote_convention_set.is_valid_closing_quotation_mark("”") + + standard_french_quote_convention_set = QuoteConventionSet([standard_french_quote_convention]) + assert standard_french_quote_convention_set.is_valid_closing_quotation_mark("»") + assert standard_french_quote_convention_set.is_valid_closing_quotation_mark("›") + assert not standard_french_quote_convention_set.is_valid_closing_quotation_mark("«") + assert not standard_french_quote_convention_set.is_valid_closing_quotation_mark("‹") + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + standard_french_quote_convention, + ] + ) + assert multiple_quote_convention_set.get_possible_closing_marks() == ["»", "‘", "’", "“", "”", "›"] + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("‘") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("’") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("“") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("”") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("»") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("›") + assert not multiple_quote_convention_set.is_valid_closing_quotation_mark("«") + assert not multiple_quote_convention_set.is_valid_closing_quotation_mark("‹") + + +def test_are_marks_a_valid_pair() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", 
"\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.are_marks_a_valid_pair("“", "”") + assert not standard_english_quote_convention_set.are_marks_a_valid_pair("”", "“") + assert standard_english_quote_convention_set.are_marks_a_valid_pair("‘", "’") + assert not standard_english_quote_convention_set.are_marks_a_valid_pair("’", "‘") + assert not standard_english_quote_convention_set.are_marks_a_valid_pair("‘", "”") + assert not standard_english_quote_convention_set.are_marks_a_valid_pair("‘", "”") + assert not standard_english_quote_convention_set.are_marks_a_valid_pair("‘", "") + assert not standard_english_quote_convention_set.are_marks_a_valid_pair("", "") + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.are_marks_a_valid_pair("„", "“") + assert 
central_european_quote_convention_set.are_marks_a_valid_pair("‚", "‘") + assert not central_european_quote_convention_set.are_marks_a_valid_pair("“", "„") + assert not central_european_quote_convention_set.are_marks_a_valid_pair("’", "‚") + assert not central_european_quote_convention_set.are_marks_a_valid_pair("‚", "“") + assert not central_european_quote_convention_set.are_marks_a_valid_pair("‚", "’") + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.are_marks_a_valid_pair("”", "”") + assert standard_swedish_quote_convention_set.are_marks_a_valid_pair("’", "’") + assert not standard_swedish_quote_convention_set.are_marks_a_valid_pair("”", "’") + assert not standard_swedish_quote_convention_set.are_marks_a_valid_pair("’", "”") + + standard_french_quote_convention_set = QuoteConventionSet([standard_french_quote_convention]) + assert standard_french_quote_convention_set.are_marks_a_valid_pair("«", "»") + assert standard_french_quote_convention_set.are_marks_a_valid_pair("‹", "›") + assert not standard_french_quote_convention_set.are_marks_a_valid_pair("«", "›") + assert not standard_french_quote_convention_set.are_marks_a_valid_pair("‹", "»") + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + standard_french_quote_convention, + ] + ) + assert multiple_quote_convention_set.are_marks_a_valid_pair("“", "”") + assert multiple_quote_convention_set.are_marks_a_valid_pair("‘", "’") + assert multiple_quote_convention_set.are_marks_a_valid_pair("„", "“") + assert multiple_quote_convention_set.are_marks_a_valid_pair("‚", "‘") + assert multiple_quote_convention_set.are_marks_a_valid_pair("”", "”") + assert multiple_quote_convention_set.are_marks_a_valid_pair("’", "’") + assert multiple_quote_convention_set.are_marks_a_valid_pair("«", "»") + assert 
multiple_quote_convention_set.are_marks_a_valid_pair("‹", "›") + assert not multiple_quote_convention_set.are_marks_a_valid_pair("‹", "»") + assert not multiple_quote_convention_set.are_marks_a_valid_pair("‹", "”") + assert not multiple_quote_convention_set.are_marks_a_valid_pair("„", "”") + assert not multiple_quote_convention_set.are_marks_a_valid_pair("’", "‘") + + +def test_is_quotation_mark_direction_ambiguous() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + typewriter_english_quote_convention: QuoteConvention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + eastern_european_quote_convention = QuoteConvention( + "eastern_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + ], + ) + + standard_english_quote_convention_set = 
QuoteConventionSet([standard_english_quote_convention]) + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous("“") + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous("”") + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous("‘") + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous('"') + + typewriter_english_quote_convention_set = QuoteConventionSet([typewriter_english_quote_convention]) + assert typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous('"') + assert typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous("'") + assert not typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous("‘") + assert not typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + assert not typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous("«") + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert not central_european_quote_convention_set.is_quotation_mark_direction_ambiguous("“") + assert not central_european_quote_convention_set.is_quotation_mark_direction_ambiguous("„") + assert not central_european_quote_convention_set.is_quotation_mark_direction_ambiguous("‘") + assert not central_european_quote_convention_set.is_quotation_mark_direction_ambiguous("‚") + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.is_quotation_mark_direction_ambiguous("”") + assert standard_swedish_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + + eastern_european_quote_convention_set = QuoteConventionSet([eastern_european_quote_convention]) + assert not 
eastern_european_quote_convention_set.is_quotation_mark_direction_ambiguous("”") + assert not eastern_european_quote_convention_set.is_quotation_mark_direction_ambiguous("„") + assert not eastern_european_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + assert not eastern_european_quote_convention_set.is_quotation_mark_direction_ambiguous("‚") + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + typewriter_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + eastern_european_quote_convention, + ] + ) + assert multiple_quote_convention_set.is_quotation_mark_direction_ambiguous('"') + assert multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("'") + assert multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("”") + assert multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + assert not multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("„") + assert not multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("‚") + + # these are unambiguous because they are never the opening and closing in the same convention + assert not multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("“") + assert not multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("‘") + + +def test_get_possible_paired_quotation_marks() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", 
"\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + eastern_european_quote_convention = QuoteConvention( + "eastern_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.get_possible_paired_quotation_marks("“") == {"”"} + assert standard_english_quote_convention_set.get_possible_paired_quotation_marks("”") == {"“"} + assert standard_english_quote_convention_set.get_possible_paired_quotation_marks("‘") == {"’"} + assert standard_english_quote_convention_set.get_possible_paired_quotation_marks("’") == {"‘"} + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.get_possible_paired_quotation_marks("„") == {"“"} + assert central_european_quote_convention_set.get_possible_paired_quotation_marks("“") == {"„"} + assert central_european_quote_convention_set.get_possible_paired_quotation_marks("‚") == {"‘"} + assert central_european_quote_convention_set.get_possible_paired_quotation_marks("‘") == {"‚"} + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.get_possible_paired_quotation_marks("”") == {"”"} + assert standard_swedish_quote_convention_set.get_possible_paired_quotation_marks("’") == {"’"} + + eastern_european_quote_convention_set = 
QuoteConventionSet([eastern_european_quote_convention]) + assert eastern_european_quote_convention_set.get_possible_paired_quotation_marks("„") == {"”"} + assert eastern_european_quote_convention_set.get_possible_paired_quotation_marks("”") == {"„"} + assert eastern_european_quote_convention_set.get_possible_paired_quotation_marks("‚") == {"’"} + assert eastern_european_quote_convention_set.get_possible_paired_quotation_marks("’") == {"‚"} + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + eastern_european_quote_convention, + ] + ) + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("“") == {"”", "„"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("”") == {"“", "”", "„"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("‘") == {"’", "‚"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("’") == {"‘", "’", "‚"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("„") == {"“", "”"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("‚") == {"‘", "’"} + + +def test_get_possible_depths() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + british_english_quote_convention: QuoteConvention = QuoteConvention( + "british_english", + [ + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + ], + ) + + normalized_western_european_quote_convention = QuoteConvention( + "western_european_normalized", + [ + 
SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Opening) == {1, 3} + assert standard_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Closing) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Closing) == {1, 3} + assert standard_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Opening) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Opening) == {2, 4} + assert standard_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Closing) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Closing) == {2, 4} + assert standard_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Opening) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Opening) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Closing) == set() + assert standard_english_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.Opening) == set() + assert standard_english_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.Closing) == set() + + british_english_quote_convention_set = QuoteConventionSet([british_english_quote_convention]) + assert british_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Opening) == {1, 3} + assert british_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Closing) == set() + assert 
british_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Closing) == {1, 3} + assert british_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Opening) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Opening) == {2, 4} + assert british_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Closing) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Closing) == {2, 4} + assert british_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Opening) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Opening) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Closing) == set() + assert british_english_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.Opening) == set() + assert british_english_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.Closing) == set() + + normalized_western_european_quote_convention_set = QuoteConventionSet( + [normalized_western_european_quote_convention] + ) + assert normalized_western_european_quote_convention_set.get_possible_depths( + '"', QuotationMarkDirection.Opening + ) == {1, 2} + assert normalized_western_european_quote_convention_set.get_possible_depths( + '"', QuotationMarkDirection.Closing + ) == {1, 2} + assert normalized_western_european_quote_convention_set.get_possible_depths( + "'", QuotationMarkDirection.Opening + ) == {3} + assert normalized_western_european_quote_convention_set.get_possible_depths( + "'", QuotationMarkDirection.Closing + ) == {3} + assert ( + normalized_western_european_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Opening) + == set() + ) + assert ( + 
normalized_western_european_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Closing) + == set() + ) + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + british_english_quote_convention, + normalized_western_european_quote_convention, + ] + ) + assert multiple_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Opening) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Closing) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Closing) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Opening) == set() + assert multiple_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Opening) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Closing) == set() + assert multiple_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Closing) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Opening) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Opening) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Closing) == set() + assert multiple_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.Opening) == {1, 2} + assert multiple_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.Closing) == {1, 2} + assert multiple_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.Opening) == {3} + assert multiple_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.Closing) == {3} + + +def test_does_metadata_match_quotation_mark() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + 
"standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201c", 1, QuotationMarkDirection.Opening + ) + assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201c", 3, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201c", 2, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201c", 4, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201c", 1, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201c", 2, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201c", 3, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201c", 4, QuotationMarkDirection.Closing + ) + assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201d", 1, QuotationMarkDirection.Closing + ) + assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201d", 3, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201d", 2, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201d", 4, QuotationMarkDirection.Closing + ) + assert not 
standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201d", 1, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201d", 2, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201d", 3, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201d", 4, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2018", 1, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2018", 3, QuotationMarkDirection.Opening + ) + assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2018", 2, QuotationMarkDirection.Opening + ) + assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2018", 4, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2018", 1, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2018", 2, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2018", 3, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2018", 4, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2019", 1, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2019", 3, QuotationMarkDirection.Closing + ) + assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2019", 2, 
QuotationMarkDirection.Closing + ) + assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2019", 4, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2019", 1, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2019", 2, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2019", 3, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u2019", 4, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201e", 1, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201e", 1, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201e", 2, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201e", 2, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201e", 3, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201e", 3, QuotationMarkDirection.Closing + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201e", 4, QuotationMarkDirection.Opening + ) + assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( + "\u201e", 4, QuotationMarkDirection.Closing + ) + + +def test_filter_to_compatible_quote_conventions() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + 
SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + western_european_quote_convention: QuoteConvention = QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u201d"] + ).get_all_quote_convention_names() == ["standard_english"] + assert standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u2018"], ["\u201d", "\u2019"] + ).get_all_quote_convention_names() == ["standard_english"] + assert standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u2018"], ["\u201d"] + ).get_all_quote_convention_names() == ["standard_english"] + assert standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u201d", "\u2019"] + ).get_all_quote_convention_names() == ["standard_english"] + assert ( + 
standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u2018"], ["\u201d"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u2019"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201d"], ["\u201c"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u201d"], ["\u201d"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u201e"], ["\u201d"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + [], [] + ).get_all_quote_convention_names() + == [] + ) + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + standard_french_quote_convention, + western_european_quote_convention, + standard_swedish_quote_convention, + ] + ) + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u201d"] + ).get_all_quote_convention_names() == ["standard_english"] + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u2018"], ["\u201d", "\u2019"] + ).get_all_quote_convention_names() == ["standard_english"] + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201d"], ["\u201d"] + ).get_all_quote_convention_names() == ["standard_swedish"] + assert ( + multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u201c"] + ).get_all_quote_convention_names() + == [] + ) + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u00ab"], ["\u00bb"] + 
).get_all_quote_convention_names() == ["standard_french", "western_european"] + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u00ab", "\u2039"], ["\u00bb"] + ).get_all_quote_convention_names() == ["standard_french"] + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u00ab"], ["\u00bb", "\u201d"] + ).get_all_quote_convention_names() == ["western_european"] + assert ( + multiple_quote_convention_set.filter_to_compatible_quote_conventions([], []).get_all_quote_convention_names() + == [] + ) + + +def test_find_most_similar_convention() -> None: + # TODO: test this after testing QuotationMarkTabulator + pass diff --git a/tests/corpora/analysis/test_text_segment.py b/tests/corpora/analysis/test_text_segment.py new file mode 100644 index 00000000..9deaa60e --- /dev/null +++ b/tests/corpora/analysis/test_text_segment.py @@ -0,0 +1,320 @@ +from machine.corpora import UsfmToken, UsfmTokenType +from machine.corpora.analysis import TextSegment, UsfmMarkerType + + +def test_builder_initialization() -> None: + builder = TextSegment.Builder() + + assert builder.text_segment.text == "" + assert builder.text_segment.previous_segment is None + assert builder.text_segment.next_segment is None + assert builder.text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker + assert builder.text_segment.markers_in_preceding_context == set() + assert builder.text_segment.index_in_verse == 0 + assert builder.text_segment.num_segments_in_verse == 0 + assert builder.text_segment.usfm_token is None + + +def test_builder_set_text() -> None: + builder = TextSegment.Builder() + text = "Example text" + builder.set_text(text) + + assert builder.text_segment.text == text + + +def test_builder_set_previous_segment() -> None: + builder = TextSegment.Builder() + previous_segment = TextSegment.Builder().set_text("previous segment text").build() + builder.set_previous_segment(previous_segment) + + assert 
builder.text_segment.previous_segment == previous_segment + assert builder.text_segment.next_segment is None + assert builder.text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker + assert builder.text_segment.markers_in_preceding_context == set() + assert builder.text_segment.index_in_verse == 0 + assert builder.text_segment.num_segments_in_verse == 0 + + +def test_builder_add_preceding_marker() -> None: + builder = TextSegment.Builder() + builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) + + assert builder.text_segment.immediate_preceding_marker is UsfmMarkerType.ChapterMarker + assert builder.text_segment.markers_in_preceding_context == {UsfmMarkerType.ChapterMarker} + assert builder.text_segment.previous_segment is None + assert builder.text_segment.next_segment is None + + builder.add_preceding_marker(UsfmMarkerType.VerseMarker) + assert builder.text_segment.immediate_preceding_marker == UsfmMarkerType.VerseMarker + assert builder.text_segment.markers_in_preceding_context == { + UsfmMarkerType.ChapterMarker, + UsfmMarkerType.VerseMarker, + } + assert builder.text_segment.previous_segment is None + assert builder.text_segment.next_segment is None + + +def test_builder_set_usfm_token() -> None: + builder = TextSegment.Builder() + builder.set_usfm_token(UsfmToken(type=UsfmTokenType.TEXT, text="USFM token text")) + + assert builder.text_segment.usfm_token is not None + assert builder.text_segment.usfm_token.type == UsfmTokenType.TEXT + assert builder.text_segment.usfm_token.text == "USFM token text" + assert builder.text_segment.text == "" + assert builder.text_segment.previous_segment is None + assert builder.text_segment.next_segment is None + + +def test_set_previous_segment() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + previous_segment = TextSegment.Builder().set_text("previous segment text").build() + text_segment.set_previous_segment(previous_segment) + + assert text_segment.previous_segment == 
previous_segment + assert text_segment.next_segment is None + assert text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker + assert text_segment.markers_in_preceding_context == set() + assert text_segment.index_in_verse == 0 + assert text_segment.num_segments_in_verse == 0 + + +def test_set_next_segment() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + next_segment = TextSegment.Builder().set_text("next segment text").build() + text_segment.set_next_segment(next_segment) + + assert text_segment.previous_segment is None + assert text_segment.next_segment == next_segment + assert text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker + assert text_segment.markers_in_preceding_context == set() + assert text_segment.index_in_verse == 0 + assert text_segment.num_segments_in_verse == 0 + + +def test_set_index_in_verse() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + text_segment.set_index_in_verse(2) + + assert text_segment.index_in_verse == 2 + assert text_segment.previous_segment is None + assert text_segment.next_segment is None + assert text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker + assert text_segment.markers_in_preceding_context == set() + assert text_segment.num_segments_in_verse == 0 + + +def test_set_num_segments_in_verse() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + text_segment.set_num_segments_in_verse(5) + + assert text_segment.num_segments_in_verse == 5 + assert text_segment.previous_segment is None + assert text_segment.next_segment is None + assert text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker + assert text_segment.markers_in_preceding_context == set() + assert text_segment.index_in_verse == 0 + + +def test_equals() -> None: + basic_segment = TextSegment.Builder().set_text("text1").build() + same_text_segment = TextSegment.Builder().set_text("text1").build() + 
different_text_segment = TextSegment.Builder().set_text("different text").build() + + assert basic_segment == basic_segment + assert basic_segment != UsfmToken(type=UsfmTokenType.TEXT, text="text1") + assert basic_segment == same_text_segment + assert basic_segment != different_text_segment + + segment_with_index = TextSegment.Builder().set_text("text1").build() + segment_with_index.set_index_in_verse(1) + segment_with_same_index = TextSegment.Builder().set_text("text1").build() + segment_with_same_index.set_index_in_verse(1) + segment_with_different_index = TextSegment.Builder().set_text("text1").build() + segment_with_different_index.set_index_in_verse(2) + + assert segment_with_index == segment_with_same_index + assert segment_with_index != segment_with_different_index + assert segment_with_index != basic_segment + + segment_with_preceding_marker = ( + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.VerseMarker).build() + ) + segment_with_same_preceding_marker = ( + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.VerseMarker).build() + ) + segment_with_different_preceding_marker = ( + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.ChapterMarker).build() + ) + segment_with_multiple_preceding_markers = ( + TextSegment.Builder() + .set_text("text1") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build() + ) + + usfm_token = UsfmToken(type=UsfmTokenType.TEXT, text="USFM token text") + segment_with_usfm_token = TextSegment.Builder().set_text("text1").set_usfm_token(usfm_token).build() + segment_with_same_usfm_token = TextSegment.Builder().set_text("text1").set_usfm_token(usfm_token).build() + segment_with_different_usfm_token = ( + TextSegment.Builder() + .set_text("text1") + .set_usfm_token(UsfmToken(type=UsfmTokenType.TEXT, text="Different USFM token text")) + .build() + ) + + assert segment_with_usfm_token == 
segment_with_same_usfm_token + assert segment_with_usfm_token != segment_with_different_usfm_token + assert basic_segment != segment_with_usfm_token + + # attributes that are not used in equality checks + segment_with_num_verses = TextSegment.Builder().set_text("text1").build() + segment_with_num_verses.set_num_segments_in_verse(3) + segment_with_same_num_verses = TextSegment.Builder().set_text("text1").build() + segment_with_same_num_verses.set_num_segments_in_verse(3) + segment_with_different_num_verses = TextSegment.Builder().set_text("text1").build() + segment_with_different_num_verses.set_num_segments_in_verse(4) + + assert segment_with_num_verses == segment_with_same_num_verses + assert segment_with_num_verses == segment_with_different_num_verses + assert segment_with_num_verses == basic_segment + + assert segment_with_preceding_marker == segment_with_same_preceding_marker + assert segment_with_preceding_marker != segment_with_different_preceding_marker + assert segment_with_preceding_marker == segment_with_multiple_preceding_markers + assert segment_with_preceding_marker != basic_segment + + segment_with_previous_segment = TextSegment.Builder().set_text("text1").build() + segment_with_previous_segment.set_previous_segment(segment_with_num_verses) + + segment_with_next_segment = TextSegment.Builder().set_text("text1").build() + segment_with_next_segment.set_next_segment(segment_with_num_verses) + + assert basic_segment == segment_with_previous_segment + assert basic_segment == segment_with_next_segment + + +def test_get_text() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + assert text_segment.get_text() == "example text" + + text_segment = TextSegment.Builder().set_text("new example text").build() + assert text_segment.get_text() == "new example text" + + +def test_length() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + assert text_segment.length() == len("example text") + + text_segment 
= TextSegment.Builder().set_text("new example text").build() + assert text_segment.length() == len("new example text") + + +def test_substring_before() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + assert text_segment.substring_before(7) == "example" + assert text_segment.substring_before(8) == "example " + assert text_segment.substring_before(0) == "" + assert text_segment.substring_before(12) == "example text" + + +def test_substring_after() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + assert text_segment.substring_after(7) == " text" + assert text_segment.substring_after(8) == "text" + assert text_segment.substring_after(0) == "example text" + assert text_segment.substring_after(12) == "" + assert text_segment.substring_after(11) == "t" + + +def test_is_marker_in_preceding_context() -> None: + no_preceding_marker_segment = TextSegment.Builder().set_text("example text").build() + assert no_preceding_marker_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker) is False + assert no_preceding_marker_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) is False + assert no_preceding_marker_segment.is_marker_in_preceding_context(UsfmMarkerType.CharacterMarker) is False + + one_preceding_marker_text_segment = ( + TextSegment.Builder().set_text("example text").add_preceding_marker(UsfmMarkerType.CharacterMarker).build() + ) + + assert one_preceding_marker_text_segment.is_marker_in_preceding_context(UsfmMarkerType.CharacterMarker) is True + assert one_preceding_marker_text_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) is False + assert one_preceding_marker_text_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker) is False + + two_preceding_markers_text_segment = ( + TextSegment.Builder() + .set_text("example text") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .build() + 
) + assert two_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker) is True + assert two_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) is True + assert two_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.CharacterMarker) is False + + three_preceding_markers_text_segment = ( + TextSegment.Builder() + .set_text("example text") + .add_preceding_marker(UsfmMarkerType.ChapterMarker) + .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .build() + ) + assert three_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker) is True + assert three_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) is True + assert three_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.CharacterMarker) is True + + +def test_is_first_segment_in_verse() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + text_segment.set_index_in_verse(0) + assert text_segment.is_first_segment_in_verse() is True + + text_segment.set_index_in_verse(1) + assert text_segment.is_first_segment_in_verse() is False + + +def test_is_last_segment_in_verse() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + text_segment.set_index_in_verse(0) + text_segment.set_num_segments_in_verse(1) + assert text_segment.is_last_segment_in_verse() is True + + text_segment.set_index_in_verse(0) + text_segment.set_num_segments_in_verse(2) + assert text_segment.is_last_segment_in_verse() is False + + text_segment.set_index_in_verse(1) + assert text_segment.is_last_segment_in_verse() is True + + +def test_replace_substring() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + text_segment.replace_substring(0, 7, "sample") + assert text_segment.get_text() == "sample text" + + 
text_segment.replace_substring(7, 11, "text") + assert text_segment.get_text() == "sample text" + + text_segment.replace_substring(0, 7, "") + assert text_segment.get_text() == "text" + + text_segment.replace_substring(0, 4, "new'") + assert text_segment.get_text() == "new'" + + text_segment.replace_substring(3, 4, "\u2019") + assert text_segment.get_text() == "new\u2019" + + text_segment.replace_substring(0, 0, "prefix ") + assert text_segment.get_text() == "prefix new\u2019" + + text_segment.replace_substring(0, 0, "") + assert text_segment.get_text() == "prefix new\u2019" + + text_segment.replace_substring(11, 11, " suffix") + assert text_segment.get_text() == "prefix new\u2019 suffix" + + text_segment.replace_substring(6, 6, "-") + assert text_segment.get_text() == "prefix- new\u2019 suffix" diff --git a/tests/corpora/analysis/test_verse.py b/tests/corpora/analysis/test_verse.py new file mode 100644 index 00000000..aa8e3fb4 --- /dev/null +++ b/tests/corpora/analysis/test_verse.py @@ -0,0 +1,42 @@ +from machine.corpora.analysis import TextSegment, Verse + + +def test_initialize_verse() -> None: + text_segments = [ + TextSegment.Builder().set_text("Segment 1").build(), + TextSegment.Builder().set_text("Segment 2").build(), + TextSegment.Builder().set_text("Segment 3").build(), + ] + + verse = Verse(text_segments) + + assert len(verse.get_text_segments()) == 3 + assert verse.get_text_segments() == text_segments + + +def test_segment_indices() -> None: + text_segments = [ + TextSegment.Builder().set_text("Segment 1").build(), + TextSegment.Builder().set_text("Segment 1").build(), + TextSegment.Builder().set_text("Segment 1").build(), + ] + + verse = Verse(text_segments) + + assert verse.get_text_segments()[0].index_in_verse == 0 + assert verse.get_text_segments()[1].index_in_verse == 1 + assert verse.get_text_segments()[2].index_in_verse == 2 + + +def test_num_segments_in_verse() -> None: + text_segments = [ + TextSegment.Builder().set_text("Segment 1").build(), + 
TextSegment.Builder().set_text("Segment 2").build(), + TextSegment.Builder().set_text("Segment 3").build(), + ] + + verse = Verse(text_segments) + + assert verse.get_text_segments()[0].num_segments_in_verse == 3 + assert verse.get_text_segments()[1].num_segments_in_verse == 3 + assert verse.get_text_segments()[2].num_segments_in_verse == 3 diff --git a/tests/corpora/test_quotation_denormalization_first_pass.py b/tests/corpora/test_quotation_denormalization_first_pass.py index 99fd83e3..d87fb918 100644 --- a/tests/corpora/test_quotation_denormalization_first_pass.py +++ b/tests/corpora/test_quotation_denormalization_first_pass.py @@ -526,7 +526,7 @@ def run_quotation_denormalization_first_pass( first_pass_analyzer = QuotationDenormalizationFirstPass(source_quote_convention, target_quote_convention) parse_usfm(normalized_usfm, first_pass_analyzer) - return first_pass_analyzer.get_best_actions_by_chapter(normalized_usfm) + return first_pass_analyzer.get_best_actions_by_chapter() def run_quotation_denormalization_first_pass_on_chapter( diff --git a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py similarity index 66% rename from tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py rename to tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py index aa330091..4c7730b4 100644 --- a/tests/corpora/test_quotation_denormalization_scripture_block_update_handler.py +++ b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py @@ -1,11 +1,31 @@ +from typing import Generator, List, Union + from machine.corpora import ( QuotationDenormalizationAction, - QuotationDenormalizationScriptureUpdateBlockHandler, QuotationDenormalizationSettings, + QuotationDenormalizationUsfmUpdateBlockHandler, + ScriptureRef, UpdateUsfmParserHandler, + UsfmToken, + UsfmTokenType, + UsfmUpdateBlock, + UsfmUpdateBlockElement, + 
UsfmUpdateBlockElementType, parse_usfm, ) -from machine.corpora.analysis import standard_quote_conventions +from machine.corpora.analysis import ( + QuotationMarkDirection, + QuotationMarkFinder, + QuotationMarkMetadata, + QuotationMarkResolutionIssue, + QuotationMarkResolutionSettings, + QuotationMarkResolver, + QuotationMarkStringMatch, + QuoteConventionSet, + TextSegment, + UsfmMarkerType, + standard_quote_conventions, +) simple_normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal @@ -398,10 +418,7 @@ def test_basic_quotation_denormalization_same_as_full() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) - .build(), + QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -423,10 +440,7 @@ def test_basic_quotation_denormalization_incorrectly_nested() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) - .build(), + QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -448,10 +462,7 @@ def test_basic_quotation_denormalization_incorrectly_nested_second_case() -> Non normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) - .build(), + QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -473,10 +484,7 @@ def test_basic_quotation_denormalization_unclosed_quote() -> None: 
normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) - .build(), + QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -510,7 +518,6 @@ def test_default_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder().run_on_existing_text().build(), ) assert_usfm_equal(observed_usfm, expected_full_usfm) @@ -518,10 +525,7 @@ def test_default_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_default_chapter_action(QuotationDenormalizationAction.APPLY_FULL) - .build(), + QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_FULL), ) assert_usfm_equal(observed_usfm, expected_full_usfm) @@ -529,10 +533,7 @@ def test_default_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_default_chapter_action(QuotationDenormalizationAction.APPLY_BASIC) - .build(), + QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), ) assert_usfm_equal(observed_usfm, expected_basic_usfm) @@ -540,10 +541,7 @@ def test_default_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_default_chapter_action(QuotationDenormalizationAction.SKIP) - .build(), + QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.SKIP), ) assert_usfm_equal(observed_usfm, expected_skipped_usfm) @@ -577,10 +575,7 @@ def 
test_single_chapter_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_chapter_actions([QuotationDenormalizationAction.APPLY_FULL]) - .build(), + QuotationDenormalizationSettings(chapter_actions=[QuotationDenormalizationAction.APPLY_FULL]), ) assert_usfm_equal(observed_usfm, expected_full_usfm) @@ -588,10 +583,7 @@ def test_single_chapter_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_chapter_actions([QuotationDenormalizationAction.APPLY_BASIC]) - .build(), + QuotationDenormalizationSettings(chapter_actions=[QuotationDenormalizationAction.APPLY_BASIC]), ) assert_usfm_equal(observed_usfm, expected_basic_usfm) @@ -599,10 +591,7 @@ def test_single_chapter_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_chapter_actions([QuotationDenormalizationAction.SKIP]) - .build(), + QuotationDenormalizationSettings(chapter_actions=[QuotationDenormalizationAction.SKIP]), ) assert_usfm_equal(observed_usfm, expected_skipped_usfm) @@ -633,10 +622,9 @@ def test_multiple_chapter_same_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_chapter_actions([QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_FULL]) - .build(), + QuotationDenormalizationSettings( + chapter_actions=[QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_FULL] + ), ) assert_usfm_equal(observed_usfm, expected_full_usfm) @@ -644,10 +632,9 @@ def test_multiple_chapter_same_denormalization_action() -> None: normalized_usfm, "standard_english", "standard_english", - 
QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_chapter_actions([QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_BASIC]) - .build(), + QuotationDenormalizationSettings( + chapter_actions=[QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_BASIC] + ), ) assert_usfm_equal(observed_usfm, expected_basic_usfm) @@ -685,10 +672,9 @@ def test_multiple_chapter_multiple_denormalization_actions() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_chapter_actions([QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_BASIC]) - .build(), + QuotationDenormalizationSettings( + chapter_actions=[QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_BASIC] + ), ) assert_usfm_equal(observed_usfm, expected_full_then_basic_usfm) @@ -696,10 +682,9 @@ def test_multiple_chapter_multiple_denormalization_actions() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_chapter_actions([QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_FULL]) - .build(), + QuotationDenormalizationSettings( + chapter_actions=[QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_FULL] + ), ) assert_usfm_equal(observed_usfm, expected_basic_then_full_usfm) @@ -707,22 +692,220 @@ def test_multiple_chapter_multiple_denormalization_actions() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .set_chapter_actions([QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.SKIP]) - .build(), + QuotationDenormalizationSettings( + chapter_actions=[QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.SKIP] + ), ) 
assert_usfm_equal(observed_usfm, expected_basic_then_skip_usfm) +def test_process_scripture_element() -> None: + quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler("standard_english", "british_english") + ) + quotation_denormalizer._quotation_mark_finder = MockQuotationMarkFinder() + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[UsfmToken(UsfmTokenType.TEXT, text="test segment")], + ) + mock_quotation_mark_resolver: QuotationMarkResolver = MockQuotationMarkResolver() + quotation_denormalizer._process_scripture_element(update_element, mock_quotation_mark_resolver) + + assert quotation_denormalizer._quotation_mark_finder.num_times_called == 1 + assert mock_quotation_mark_resolver.num_times_called == 1 + assert quotation_denormalizer._quotation_mark_finder.matches_to_return[0].text_segment.text == "this is a ‘test" + assert quotation_denormalizer._quotation_mark_finder.matches_to_return[1].text_segment.text == "the test ends” here" + + +def test_create_text_segments_basic() -> None: + quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") + ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, tokens=[UsfmToken(UsfmTokenType.TEXT, text="test segment")] + ) + text_segments: List[TextSegment] = quotation_denormalizer._create_text_segments(update_element) + + assert len(text_segments) == 1 + assert text_segments[0].text == "test segment" + assert text_segments[0].immediate_preceding_marker is UsfmMarkerType.NoMarker + assert text_segments[0].markers_in_preceding_context == set() + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment is None + + +def test_create_text_segments_with_preceding_markers() -> None: + 
quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") + ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[ + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.PARAGRAPH), + UsfmToken(UsfmTokenType.TEXT, text="test segment"), + ], + ) + text_segments: List[TextSegment] = quotation_denormalizer._create_text_segments(update_element) + + assert len(text_segments) == 1 + assert text_segments[0].text == "test segment" + assert text_segments[0].immediate_preceding_marker == UsfmMarkerType.ParagraphMarker + assert text_segments[0].markers_in_preceding_context == { + UsfmMarkerType.VerseMarker, + UsfmMarkerType.ParagraphMarker, + } + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment is None + + +def test_create_text_segments_with_multiple_text_tokens() -> None: + quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") + ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[ + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.PARAGRAPH), + UsfmToken(UsfmTokenType.TEXT, text="test segment1"), + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.CHARACTER), + UsfmToken(UsfmTokenType.TEXT, text="test segment2"), + UsfmToken(UsfmTokenType.PARAGRAPH), + ], + ) + text_segments: List[TextSegment] = quotation_denormalizer._create_text_segments(update_element) + + assert len(text_segments) == 2 + assert text_segments[0].text == "test segment1" + assert text_segments[0].immediate_preceding_marker == UsfmMarkerType.ParagraphMarker + assert text_segments[0].markers_in_preceding_context == {UsfmMarkerType.VerseMarker, UsfmMarkerType.ParagraphMarker} + assert 
text_segments[0].previous_segment is None + assert text_segments[0].next_segment == text_segments[1] + assert text_segments[1].text == "test segment2" + assert text_segments[1].immediate_preceding_marker == UsfmMarkerType.CharacterMarker + assert text_segments[1].markers_in_preceding_context == {UsfmMarkerType.VerseMarker, UsfmMarkerType.CharacterMarker} + assert text_segments[1].previous_segment == text_segments[0] + assert text_segments[1].next_segment is None + + +def test_create_text_segment() -> None: + quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") + ) + + usfm_token: UsfmToken = UsfmToken(UsfmTokenType.TEXT, text="test segment") + segment: Union[TextSegment, None] = quotation_denormalizer._create_text_segment(usfm_token) + + assert segment is not None + assert segment.text == "test segment" + assert segment.immediate_preceding_marker is UsfmMarkerType.NoMarker + assert segment.markers_in_preceding_context == set() + assert segment.usfm_token == usfm_token + + +def test_set_previous_and_next_for_segments() -> None: + quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") + ) + + segments: List[TextSegment] = [ + TextSegment.Builder().set_text("segment 1 text").build(), + TextSegment.Builder().set_text("segment 2 text").build(), + TextSegment.Builder().set_text("segment 3 text").build(), + ] + + quotation_denormalizer._set_previous_and_next_for_segments(segments) + + assert segments[0].previous_segment is None + assert segments[0].next_segment == segments[1] + assert segments[1].previous_segment == segments[0] + assert segments[1].next_segment == segments[2] + assert segments[2].previous_segment == segments[1] + assert segments[2].next_segment is None + + +def test_check_for_chapter_change() -> None: + 
quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") + ) + + assert quotation_denormalizer._current_chapter_number == 0 + + quotation_denormalizer._check_for_chapter_change(UsfmUpdateBlock([ScriptureRef.parse("MAT 1:1")], [])) + + assert quotation_denormalizer._current_chapter_number == 1 + + quotation_denormalizer._check_for_chapter_change(UsfmUpdateBlock([ScriptureRef.parse("ISA 15:22")], [])) + + assert quotation_denormalizer._current_chapter_number == 15 + + +def test_start_new_chapter() -> None: + quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler( + "standard_english", + "standard_english", + QuotationDenormalizationSettings( + chapter_actions=[ + QuotationDenormalizationAction.SKIP, + QuotationDenormalizationAction.APPLY_FULL, + QuotationDenormalizationAction.APPLY_BASIC, + ] + ), + ) + ) + + quotation_denormalizer._next_scripture_text_segment_builder.add_preceding_marker( + UsfmMarkerType.EmbedMarker + ).set_text("this text should be erased") + quotation_denormalizer._verse_text_quotation_mark_resolver._issues.add( + QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK + ) + + quotation_denormalizer._start_new_chapter(1) + segment = quotation_denormalizer._next_scripture_text_segment_builder.build() + assert quotation_denormalizer._current_denormalization_action == QuotationDenormalizationAction.SKIP + assert segment.immediate_preceding_marker == UsfmMarkerType.ChapterMarker + assert segment.text == "" + assert UsfmMarkerType.EmbedMarker not in segment.markers_in_preceding_context + assert quotation_denormalizer._verse_text_quotation_mark_resolver._issues == set() + + quotation_denormalizer._start_new_chapter(2) + assert quotation_denormalizer._current_denormalization_action == QuotationDenormalizationAction.APPLY_FULL + + 
quotation_denormalizer._start_new_chapter(3) + assert quotation_denormalizer._current_denormalization_action == QuotationDenormalizationAction.APPLY_BASIC + + def denormalize_quotation_marks( normalized_usfm: str, source_quote_convention_name: str, target_quote_convention_name: str, - quotation_denormalization_settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings.Builder() - .run_on_existing_text() - .build(), + quotation_denormalization_settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings(), ) -> str: + quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + create_quotation_denormalization_usfm_update_block_handler( + source_quote_convention_name, target_quote_convention_name, quotation_denormalization_settings + ) + ) + + updater = UpdateUsfmParserHandler(update_block_handlers=[quotation_denormalizer]) + parse_usfm(normalized_usfm, updater) + + return updater.get_usfm() + + +def create_quotation_denormalization_usfm_update_block_handler( + source_quote_convention_name: str, + target_quote_convention_name: str, + quotation_denormalization_settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings(), +) -> QuotationDenormalizationUsfmUpdateBlockHandler: source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( source_quote_convention_name ) @@ -733,20 +916,50 @@ def denormalize_quotation_marks( ) assert target_quote_convention is not None - quotation_denormalizer: QuotationDenormalizationScriptureUpdateBlockHandler = ( - QuotationDenormalizationScriptureUpdateBlockHandler( - source_quote_convention, - target_quote_convention, - quotation_denormalization_settings, - ) + return QuotationDenormalizationUsfmUpdateBlockHandler( + source_quote_convention, + target_quote_convention, + quotation_denormalization_settings, ) - updater = UpdateUsfmParserHandler(update_block_handlers=[quotation_denormalizer]) - 
parse_usfm(normalized_usfm, updater) - - return updater.get_usfm() - def assert_usfm_equal(observed_usfm: str, expected_usfm: str) -> None: for observed_line, expected_line in zip(observed_usfm.split("\n"), expected_usfm.split("\n")): assert observed_line.strip() == expected_line.strip() + + +class MockQuotationMarkFinder(QuotationMarkFinder): + def __init__(self) -> None: + super().__init__(QuoteConventionSet([])) + self.num_times_called = 0 + self.matches_to_return = [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('this is a "test').build(), 10, 11), + QuotationMarkStringMatch(TextSegment.Builder().set_text('the test ends" here').build(), 13, 14), + ] + + def find_all_potential_quotation_marks_in_text_segments( + self, text_segments: List[TextSegment] + ) -> List[QuotationMarkStringMatch]: + self.num_times_called += 1 + return self.matches_to_return + + +class MockQuotationMarkResolver(QuotationMarkResolver): + def __init__(self): + super().__init__(QuotationMarkResolutionSettings()) + self.num_times_called = 0 + + def resolve_quotation_marks( + self, quote_matches: List[QuotationMarkStringMatch] + ) -> Generator[QuotationMarkMetadata, None, None]: + self.num_times_called += 1 + current_depth = 1 + current_direction = QuotationMarkDirection.Opening + for quote_match in quote_matches: + yield quote_match.resolve(current_depth, current_direction) + current_depth += 1 + current_direction = ( + QuotationMarkDirection.Closing + if current_direction == QuotationMarkDirection.Opening + else QuotationMarkDirection.Opening + ) From 51ee352778c22cf4fd6b4f39fe54bcfd75276fd4 Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 6 Jun 2025 15:32:01 -0400 Subject: [PATCH 14/31] Fix a bug related to verse markers before quotation marks --- ...normalization_usfm_update_block_handler.py | 14 +++++++ .../scripture_ref_usfm_parser_handler.py | 41 +++++++------------ 2 files changed, 29 insertions(+), 26 deletions(-) diff --git 
a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py index 4d848c38..1128bfb0 100644 --- a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py +++ b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py @@ -54,9 +54,11 @@ def __init__( ) self._current_denormalization_action = QuotationDenormalizationAction.APPLY_FULL self._current_chapter_number: int = 0 + self._current_verse_number: int = 0 def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: self._check_for_chapter_change(block) + self._check_for_verse_change(block) if self._current_denormalization_action is QuotationDenormalizationAction.SKIP: return block if self._current_denormalization_action is QuotationDenormalizationAction.APPLY_BASIC: @@ -134,3 +136,15 @@ def _start_new_chapter(self, new_chapter_number: int) -> None: self._verse_text_quotation_mark_resolver.reset() self._next_scripture_text_segment_builder = TextSegment.Builder() self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) + + def _check_for_verse_change(self, block: UsfmUpdateBlock) -> None: + for scripture_ref in block.refs: + if ( + scripture_ref.chapter_num == self._current_chapter_number + and scripture_ref.verse_num != self._current_verse_number + ): + self._current_verse_number = scripture_ref.verse_num + self._start_new_verse(self._current_verse_number) + + def _start_new_verse(self, new_chapter_number: int) -> None: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py index febc4922..db9081b7 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler.py @@ -5,7 +5,6 @@ from ..scripture.verse_ref import VerseRef, 
are_overlapping_verse_ranges from .corpora_utils import merge_verse_ranges from .scripture_element import ScriptureElement -from .scripture_embed import EMBED_PART_START_CHAR_STYLES, is_embed_part_style, is_embed_style, is_note_text from .scripture_ref import ScriptureRef from .usfm_parser_handler import UsfmParserHandler from .usfm_parser_state import UsfmParserState @@ -22,6 +21,10 @@ class ScriptureTextType(Enum): _EMBED_STYLES = {"f", "fe", "x", "fig"} +def _is_embed_style(marker: Optional[str]) -> bool: + return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z")) + + class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC): def __init__(self) -> None: self._cur_verse_ref: VerseRef = VerseRef() @@ -119,29 +122,23 @@ def opt_break(self, state: UsfmParserState) -> None: def start_char( self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]] ) -> None: - if is_embed_part_style(marker) and self._in_note_text: - self._in_nested_embed = True # if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment self._check_convert_verse_para_to_non_verse(state) - if is_embed_style(marker): - self._in_embed = True - self._start_embed_wrapper(state, marker) - - if is_note_text(marker): - self._start_note_text_wrapper(state) + if _is_embed_style(marker): + self._start_embed_text_wrapper(state, marker) def end_char( self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool ) -> None: - if is_embed_part_style(marker): - if self._in_nested_embed: - self._in_nested_embed = False - else: - self._end_note_text_wrapper(state) - if is_embed_style(marker): - self._end_embed(state, marker, attributes, closed) - self._in_embed = False + if _is_embed_style(marker): + self._end_embed_text_wrapper(state) + + def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None: + 
self._start_embed_text_wrapper(state, marker) + + def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: + self._end_embed_text_wrapper(state) def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[Sequence[ScriptureRef]]) -> None: ... @@ -212,7 +209,7 @@ def _end_parent_element(self) -> None: self._cur_elements_stack.pop() def _end_embed_elements(self) -> None: - if self._cur_elements_stack and is_embed_style(self._cur_elements_stack[-1].name): + if self._cur_elements_stack and _is_embed_style(self._cur_elements_stack[-1].name): self._cur_elements_stack.pop() def _create_verse_refs(self) -> List[ScriptureRef]: @@ -241,11 +238,3 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None ): self._start_parent_element(para_tag.marker) self._start_non_verse_text_wrapper(state) - - def _is_in_embed(self, marker: Optional[str]) -> bool: - return self._in_embed or is_embed_style(marker) - - def _is_in_nested_embed(self, marker: Optional[str]) -> bool: - return self._in_nested_embed or ( - marker is not None and marker.startswith("+") and marker[1] in EMBED_PART_START_CHAR_STYLES - ) From f826ab2333a63471183214414c6cab0711aae5b1 Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 20 Jun 2025 17:39:58 -0400 Subject: [PATCH 15/31] Refactoring to allow arbitrary quote convention changes + more unit tests --- machine/corpora/__init__.py | 20 +- machine/corpora/analysis/__init__.py | 3 + machine/corpora/analysis/quote_convention.py | 14 + ...py => fallback_quotation_mark_resolver.py} | 2 +- .../quotation_denormalization_first_pass.py | 79 +- ...normalization_usfm_update_block_handler.py | 146 +--- .../quotation_mark_update_first_pass.py | 82 +++ ...tation_mark_update_resolution_settings.py} | 16 +- ...s.py => quotation_mark_update_settings.py} | 10 +- ...n.py => quotation_mark_update_strategy.py} | 4 +- ...tion_changing_usfm_update_block_handler.py | 150 ++++ .../analysis/test_quotation_mark_metadata.py | 52 
++ .../analysis/test_quotation_mark_tabulator.py | 139 ++++ ... test_fallback_quotation_mark_resolver.py} | 39 +- ...normalization_usfm_block_update_handler.py | 590 +-------------- ... test_quotation_mark_update_first_pass.py} | 350 ++++++--- ...tion_changing_usfm_block_update_handler.py | 692 ++++++++++++++++++ 17 files changed, 1444 insertions(+), 944 deletions(-) rename machine/corpora/{basic_quotation_mark_resolver.py => fallback_quotation_mark_resolver.py} (98%) create mode 100644 machine/corpora/quotation_mark_update_first_pass.py rename machine/corpora/{quotation_denormalization_resolution_settings.py => quotation_mark_update_resolution_settings.py} (63%) rename machine/corpora/{quotation_denormalization_settings.py => quotation_mark_update_settings.py} (54%) rename machine/corpora/{quotation_denormalization_action.py => quotation_mark_update_strategy.py} (51%) create mode 100644 machine/corpora/quote_convention_changing_usfm_update_block_handler.py create mode 100644 tests/corpora/analysis/test_quotation_mark_metadata.py create mode 100644 tests/corpora/analysis/test_quotation_mark_tabulator.py rename tests/corpora/{test_basic_quotation_mark_resolver.py => test_fallback_quotation_mark_resolver.py} (88%) rename tests/corpora/{test_quotation_denormalization_first_pass.py => test_quotation_mark_update_first_pass.py} (54%) create mode 100644 tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 518901fa..25c5b4d3 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -2,12 +2,12 @@ from .alignment_collection import AlignmentCollection from .alignment_corpus import AlignmentCorpus from .alignment_row import AlignmentRow -from .basic_quotation_mark_resolver import BasicQuotationMarkResolver from .corpora_utils import batch from .corpus import Corpus from .dbl_bundle_text_corpus import DblBundleTextCorpus from .dictionary_alignment_corpus import 
DictionaryAlignmentCorpus from .dictionary_text_corpus import DictionaryTextCorpus +from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser from .file_paratext_project_text_updater import FileParatextProjectTextUpdater from .flatten import flatten @@ -25,11 +25,13 @@ from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler -from .quotation_denormalization_action import QuotationDenormalizationAction from .quotation_denormalization_first_pass import QuotationDenormalizationFirstPass -from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings -from .quotation_denormalization_settings import QuotationDenormalizationSettings from .quotation_denormalization_usfm_update_block_handler import QuotationDenormalizationUsfmUpdateBlockHandler +from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass +from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings +from .quotation_mark_update_settings import QuotationMarkUpdateSettings +from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy +from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -92,7 +94,7 @@ "AlignmentCollection", "AlignmentCorpus", "AlignmentRow", - "BasicQuotationMarkResolver", + "FallbackQuotationMarkResolver", "batch", "Corpus", "create_versification_ref_corpus", @@ -128,11 +130,13 @@ "PlaceMarkersAlignmentInfo", "PlaceMarkersUsfmUpdateBlockHandler", 
"parse_usfm", - "QuotationDenormalizationAction", + "QuoteConventionChangingUsfmUpdateBlockHandler", + "QuotationMarkUpdateResolutionSettings", + "QuotationMarkUpdateStrategy", + "QuotationMarkUpdateFirstPass", "QuotationDenormalizationFirstPass", "QuotationDenormalizationUsfmUpdateBlockHandler", - "QuotationDenormalizationResolutionSettings", - "QuotationDenormalizationSettings", + "QuotationMarkUpdateSettings", "RtlReferenceOrder", "ScriptureElement", "ScriptureRef", diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/analysis/__init__.py index 90579e4a..9741cb8d 100644 --- a/machine/corpora/analysis/__init__.py +++ b/machine/corpora/analysis/__init__.py @@ -7,6 +7,7 @@ from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings from .quotation_mark_resolver import QuotationMarkResolver from .quotation_mark_string_match import QuotationMarkStringMatch +from .quotation_mark_tabulator import QuotationMarkCounts, QuotationMarkTabulator from .quote_convention import QuoteConvention, SingleLevelQuoteConvention from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector @@ -20,6 +21,7 @@ "Chapter", "DepthBasedQuotationMarkResolver", "SingleLevelQuoteConvention", + "QuotationMarkCounts", "QuotationMarkDirection", "QuotationMarkMetadata", "QuotationMarkStringMatch", @@ -30,6 +32,7 @@ "QuotationMarkResolutionIssue", "QuotationMarkResolutionSettings", "QuotationMarkResolver", + "QuotationMarkTabulator", "QuoteConventionDetector", "QuoteConventionSet", "TextSegment", diff --git a/machine/corpora/analysis/quote_convention.py b/machine/corpora/analysis/quote_convention.py index 9a5ebe0b..944b8ee4 100644 --- a/machine/corpora/analysis/quote_convention.py +++ b/machine/corpora/analysis/quote_convention.py @@ -48,6 +48,20 @@ def __init__(self, name: str, levels: list[SingleLevelQuoteConvention]): self.name = name 
self.levels = levels + def __eq__(self, value): + if not isinstance(value, QuoteConvention): + return False + if self.name != value.name: + return False + if len(self.levels) != len(value.levels): + return False + for level, other_level in zip(self.levels, value.levels): + if level.get_opening_quote() != other_level.get_opening_quote(): + return False + if level.get_closing_quote() != other_level.get_closing_quote(): + return False + return True + def get_name(self) -> str: return self.name diff --git a/machine/corpora/basic_quotation_mark_resolver.py b/machine/corpora/fallback_quotation_mark_resolver.py similarity index 98% rename from machine/corpora/basic_quotation_mark_resolver.py rename to machine/corpora/fallback_quotation_mark_resolver.py index 5e945c6a..106ef460 100644 --- a/machine/corpora/basic_quotation_mark_resolver.py +++ b/machine/corpora/fallback_quotation_mark_resolver.py @@ -8,7 +8,7 @@ from .analysis.quotation_mark_string_match import QuotationMarkStringMatch -class BasicQuotationMarkResolver(QuotationMarkResolver): +class FallbackQuotationMarkResolver(QuotationMarkResolver): def __init__(self, settings: QuotationMarkResolutionSettings): self._settings: QuotationMarkResolutionSettings = settings diff --git a/machine/corpora/quotation_denormalization_first_pass.py b/machine/corpora/quotation_denormalization_first_pass.py index d1767715..19ecdb9d 100644 --- a/machine/corpora/quotation_denormalization_first_pass.py +++ b/machine/corpora/quotation_denormalization_first_pass.py @@ -1,81 +1,8 @@ -from typing import Dict, List, Set - -from .analysis.chapter import Chapter -from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver -from .analysis.quotation_mark_finder import QuotationMarkFinder -from .analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue -from .analysis.quotation_mark_resolver import QuotationMarkResolver -from .analysis.quotation_mark_string_match import QuotationMarkStringMatch from 
.analysis.quote_convention import QuoteConvention -from .analysis.quote_convention_set import QuoteConventionSet -from .analysis.usfm_structure_extractor import UsfmStructureExtractor -from .quotation_denormalization_action import QuotationDenormalizationAction -from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings +from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass -class QuotationDenormalizationFirstPass(UsfmStructureExtractor): +class QuotationDenormalizationFirstPass(QuotationMarkUpdateFirstPass): def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): - super().__init__() - self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( - QuoteConventionSet([source_quote_convention.normalize()]) - ) - self._quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( - QuotationDenormalizationResolutionSettings(source_quote_convention, target_quote_convention) - ) - self._will_basic_denormalization_work: bool = self._check_whether_basic_denormalization_will_work( - source_quote_convention, target_quote_convention - ) - - def _check_whether_basic_denormalization_will_work( - self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention - ) -> bool: - normalized_source_quote_convention: QuoteConvention = source_quote_convention.normalize() - target_marks_by_normalized_source_marks: Dict[str, Set[str]] = {} - for level in range(1, normalized_source_quote_convention.get_num_levels() + 1): - normalized_opening_quotation_mark = normalized_source_quote_convention.get_opening_quote_at_level(level) - if normalized_opening_quotation_mark not in target_marks_by_normalized_source_marks: - target_marks_by_normalized_source_marks[normalized_opening_quotation_mark] = set() - if level <= target_quote_convention.get_num_levels(): - 
target_marks_by_normalized_source_marks[normalized_opening_quotation_mark].add( - target_quote_convention.get_closing_quote_at_level(level) - ) - - for normalized_source_mark in target_marks_by_normalized_source_marks: - if len(target_marks_by_normalized_source_marks[normalized_source_mark]) > 1: - return False - return True - - def get_best_actions_by_chapter(self) -> List[QuotationDenormalizationAction]: - best_actions_by_chapter: List[QuotationDenormalizationAction] = [] - - for chapter in self.get_chapters(): - best_actions_by_chapter.append(self._find_best_action_for_chapter(chapter)) - - return best_actions_by_chapter - - def _find_best_action_for_chapter(self, chapter: Chapter) -> QuotationDenormalizationAction: - quotation_mark_matches: List[QuotationMarkStringMatch] = ( - self._quotation_mark_finder.find_all_potential_quotation_marks_in_chapter(chapter) - ) - - self._quotation_mark_resolver.reset() - - # use list() to force evaluation of the generator - list(self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches)) - - return self._choose_best_action_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) - - def _choose_best_action_based_on_observed_issues(self, issues) -> QuotationDenormalizationAction: - if QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues: - return QuotationDenormalizationAction.SKIP - - if ( - QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK in issues - or QuotationMarkResolutionIssue.TOO_DEEP_NESTING in issues - ): - if self._will_basic_denormalization_work: - return QuotationDenormalizationAction.APPLY_BASIC - return QuotationDenormalizationAction.SKIP - - return QuotationDenormalizationAction.APPLY_FULL + super().__init__(source_quote_convention.normalize(), target_quote_convention) diff --git a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py index 1128bfb0..6b412202 100644 --- 
a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py +++ b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py @@ -1,150 +1,14 @@ -from typing import List, Union - -from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver -from .analysis.quotation_mark_finder import QuotationMarkFinder -from .analysis.quotation_mark_resolver import QuotationMarkResolver -from .analysis.quotation_mark_string_match import QuotationMarkStringMatch from .analysis.quote_convention import QuoteConvention -from .analysis.quote_convention_set import QuoteConventionSet -from .analysis.text_segment import TextSegment -from .analysis.usfm_marker_type import UsfmMarkerType -from .basic_quotation_mark_resolver import BasicQuotationMarkResolver -from .quotation_denormalization_action import QuotationDenormalizationAction -from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings -from .quotation_denormalization_settings import QuotationDenormalizationSettings -from .usfm_token import UsfmToken, UsfmTokenType -from .usfm_update_block import UsfmUpdateBlock -from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType -from .usfm_update_block_handler import UsfmUpdateBlockHandler +from .quotation_mark_update_settings import QuotationMarkUpdateSettings +from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler -class QuotationDenormalizationUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): +class QuotationDenormalizationUsfmUpdateBlockHandler(QuoteConventionChangingUsfmUpdateBlockHandler): def __init__( self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention, - settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings(), + settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), ): - super().__init__() - self._source_quote_convention: 
QuoteConvention = source_quote_convention - self._target_quote_convention: QuoteConvention = target_quote_convention - self._settings: QuotationDenormalizationSettings = settings - - self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( - QuoteConventionSet([self._source_quote_convention.normalize()]) - ) - self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder() - - resolution_settings = QuotationDenormalizationResolutionSettings( - self._source_quote_convention, self._target_quote_convention - ) - - # Each embed represents a separate context for quotation marks - # (i.e. you can't open a quote in one context and close it in another) - # so we need to keep track of the verse and embed contexts separately. - self._verse_text_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver( - resolution_settings - ) - self._embed_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver( - resolution_settings - ) - self._simple_quotation_mark_resolver: BasicQuotationMarkResolver = BasicQuotationMarkResolver( - resolution_settings - ) - self._current_denormalization_action = QuotationDenormalizationAction.APPLY_FULL - self._current_chapter_number: int = 0 - self._current_verse_number: int = 0 - - def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: - self._check_for_chapter_change(block) - self._check_for_verse_change(block) - if self._current_denormalization_action is QuotationDenormalizationAction.SKIP: - return block - if self._current_denormalization_action is QuotationDenormalizationAction.APPLY_BASIC: - return self._apply_simple_denormalization(block) - return self._apply_full_denormalization(block) - - def _apply_simple_denormalization(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: - for element in block._elements: - self._process_scripture_element(element, self._simple_quotation_mark_resolver) - return block - - def 
_apply_full_denormalization(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: - for element in block._elements: - if element.type == UsfmUpdateBlockElementType.EMBED: - self._embed_quotation_mark_resolver.reset() - self._process_scripture_element(element, self._embed_quotation_mark_resolver) - else: - self._process_scripture_element(element, self._verse_text_quotation_mark_resolver) - - return block - - def _process_scripture_element( - self, element: UsfmUpdateBlockElement, quotation_mark_resolver: QuotationMarkResolver - ) -> None: - text_segments: List[TextSegment] = self._create_text_segments(element) - quotation_mark_matches: List[QuotationMarkStringMatch] = ( - self._quotation_mark_finder.find_all_potential_quotation_marks_in_text_segments(text_segments) - ) - for resolved_quotation_mark in quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches): - resolved_quotation_mark.update_quotation_mark(self._target_quote_convention) - - def _create_text_segments(self, element: UsfmUpdateBlockElement) -> List[TextSegment]: - text_segments: List[TextSegment] = [] - for token in element.get_tokens(): - if token.type == UsfmTokenType.VERSE: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) - elif token.type == UsfmTokenType.PARAGRAPH: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker) - elif token.type == UsfmTokenType.CHARACTER: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) - elif token.type == UsfmTokenType.NOTE: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) - elif token.type == UsfmTokenType.TEXT: - text_segment: Union[TextSegment, None] = self._create_text_segment(token) - if text_segment is not None: - text_segments.append(text_segment) - return self._set_previous_and_next_for_segments(text_segments) - - def _create_text_segment(self, token: UsfmToken) -> 
Union[TextSegment, None]: - self._next_scripture_text_segment_builder.set_usfm_token(token) - if token.text is not None: - self._next_scripture_text_segment_builder.set_text(token.text) - text_segment_to_return: TextSegment = self._next_scripture_text_segment_builder.build() - self._next_scripture_text_segment_builder = TextSegment.Builder() - return text_segment_to_return - else: - self._next_scripture_text_segment_builder = TextSegment.Builder() - - def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) -> List[TextSegment]: - for i in range(len(text_segments)): - if i > 0: - text_segments[i].set_previous_segment(text_segments[i - 1]) - if i < len(text_segments) - 1: - text_segments[i].set_next_segment(text_segments[i + 1]) - return text_segments - - def _check_for_chapter_change(self, block: UsfmUpdateBlock) -> None: - for scripture_ref in block.refs: - if scripture_ref.chapter_num != self._current_chapter_number: - self._current_chapter_number = scripture_ref.chapter_num - self._start_new_chapter(self._current_chapter_number) - - def _start_new_chapter(self, new_chapter_number: int) -> None: - self._current_denormalization_action = self._settings.get_action_for_chapter(new_chapter_number) - self._verse_text_quotation_mark_resolver.reset() - self._next_scripture_text_segment_builder = TextSegment.Builder() - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) - - def _check_for_verse_change(self, block: UsfmUpdateBlock) -> None: - for scripture_ref in block.refs: - if ( - scripture_ref.chapter_num == self._current_chapter_number - and scripture_ref.verse_num != self._current_verse_number - ): - self._current_verse_number = scripture_ref.verse_num - self._start_new_verse(self._current_verse_number) - - def _start_new_verse(self, new_chapter_number: int) -> None: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) + 
super().__init__(source_quote_convention.normalize(), target_quote_convention, settings) diff --git a/machine/corpora/quotation_mark_update_first_pass.py b/machine/corpora/quotation_mark_update_first_pass.py new file mode 100644 index 00000000..e4b294e3 --- /dev/null +++ b/machine/corpora/quotation_mark_update_first_pass.py @@ -0,0 +1,82 @@ +from typing import Dict, List, Set + +from .analysis.chapter import Chapter +from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .analysis.quotation_mark_finder import QuotationMarkFinder +from .analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .analysis.quotation_mark_resolver import QuotationMarkResolver +from .analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .analysis.quote_convention import QuoteConvention +from .analysis.quote_convention_set import QuoteConventionSet +from .analysis.usfm_structure_extractor import UsfmStructureExtractor +from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings +from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy + + +class QuotationMarkUpdateFirstPass(UsfmStructureExtractor): + + def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): + super().__init__() + self._source_quote_convention: QuoteConvention = source_quote_convention + self._target_quote_convention: QuoteConvention = target_quote_convention + self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( + QuoteConventionSet([source_quote_convention]) + ) + self._quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(source_quote_convention, target_quote_convention) + ) + self._will_fallback_mode_work: bool = self._check_whether_fallback_mode_will_work( + source_quote_convention, target_quote_convention + ) + + def 
_check_whether_fallback_mode_will_work( + self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention + ) -> bool: + target_marks_by_source_marks: Dict[str, Set[str]] = {} + for level in range(1, source_quote_convention.get_num_levels() + 1): + opening_quotation_mark = source_quote_convention.get_opening_quote_at_level(level) + if opening_quotation_mark not in target_marks_by_source_marks: + target_marks_by_source_marks[opening_quotation_mark] = set() + if level <= target_quote_convention.get_num_levels(): + target_marks_by_source_marks[opening_quotation_mark].add( + target_quote_convention.get_closing_quote_at_level(level) + ) + + for source_mark in target_marks_by_source_marks: + if len(target_marks_by_source_marks[source_mark]) > 1: + return False + return True + + def get_best_actions_by_chapter(self) -> List[QuotationMarkUpdateStrategy]: + best_actions_by_chapter: List[QuotationMarkUpdateStrategy] = [] + + for chapter in self.get_chapters(): + best_actions_by_chapter.append(self._find_best_action_for_chapter(chapter)) + + return best_actions_by_chapter + + def _find_best_action_for_chapter(self, chapter: Chapter) -> QuotationMarkUpdateStrategy: + quotation_mark_matches: List[QuotationMarkStringMatch] = ( + self._quotation_mark_finder.find_all_potential_quotation_marks_in_chapter(chapter) + ) + + self._quotation_mark_resolver.reset() + + # use list() to force evaluation of the generator + list(self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches)) + + return self._choose_best_action_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) + + def _choose_best_action_based_on_observed_issues(self, issues) -> QuotationMarkUpdateStrategy: + if QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues: + return QuotationMarkUpdateStrategy.SKIP + + if ( + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK in issues + or QuotationMarkResolutionIssue.TOO_DEEP_NESTING in issues + ): + if 
self._will_fallback_mode_work: + return QuotationMarkUpdateStrategy.APPLY_FALLBACK + return QuotationMarkUpdateStrategy.SKIP + + return QuotationMarkUpdateStrategy.APPLY_FULL diff --git a/machine/corpora/quotation_denormalization_resolution_settings.py b/machine/corpora/quotation_mark_update_resolution_settings.py similarity index 63% rename from machine/corpora/quotation_denormalization_resolution_settings.py rename to machine/corpora/quotation_mark_update_resolution_settings.py index 8f5dc0c5..77d9009b 100644 --- a/machine/corpora/quotation_denormalization_resolution_settings.py +++ b/machine/corpora/quotation_mark_update_resolution_settings.py @@ -7,28 +7,28 @@ from .analysis.quote_convention_set import QuoteConventionSet -class QuotationDenormalizationResolutionSettings(QuotationMarkResolutionSettings): +class QuotationMarkUpdateResolutionSettings(QuotationMarkResolutionSettings): def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): - self._normalized_source_quote_convention = source_quote_convention.normalize() - self._normalized_quote_convention_singleton_set = QuoteConventionSet([self._normalized_source_quote_convention]) + self._source_quote_convention = source_quote_convention + self._quote_convention_singleton_set = QuoteConventionSet([self._source_quote_convention]) self._target_quote_convention = target_quote_convention def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: - return quotation_mark_match.is_valid_opening_quotation_mark(self._normalized_quote_convention_singleton_set) + return quotation_mark_match.is_valid_opening_quotation_mark(self._quote_convention_singleton_set) def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: - return quotation_mark_match.is_valid_closing_quotation_mark(self._normalized_quote_convention_singleton_set) + return 
quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_singleton_set) def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: - return self._normalized_quote_convention_singleton_set.are_marks_a_valid_pair(opening_mark, closing_mark) + return self._quote_convention_singleton_set.are_marks_a_valid_pair(opening_mark, closing_mark) def should_rely_on_paragraph_markers(self): return False def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: - return self._normalized_source_quote_convention.get_possible_depths(quotation_mark, direction) + return self._source_quote_convention.get_possible_depths(quotation_mark, direction) def does_metadata_match_quotation_mark( self, quotation_mark: str, depth: int, direction: QuotationMarkDirection ) -> bool: - return self._normalized_source_quote_convention.get_expected_quotation_mark(depth, direction) == quotation_mark + return self._source_quote_convention.get_expected_quotation_mark(depth, direction) == quotation_mark diff --git a/machine/corpora/quotation_denormalization_settings.py b/machine/corpora/quotation_mark_update_settings.py similarity index 54% rename from machine/corpora/quotation_denormalization_settings.py rename to machine/corpora/quotation_mark_update_settings.py index 76c85f5d..cb4de267 100644 --- a/machine/corpora/quotation_denormalization_settings.py +++ b/machine/corpora/quotation_mark_update_settings.py @@ -1,17 +1,17 @@ -from .quotation_denormalization_action import QuotationDenormalizationAction +from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy -class QuotationDenormalizationSettings: +class QuotationMarkUpdateSettings: def __init__( self, - default_chapter_action: QuotationDenormalizationAction = QuotationDenormalizationAction.APPLY_FULL, - chapter_actions: list[QuotationDenormalizationAction] = [], + default_chapter_action: QuotationMarkUpdateStrategy = QuotationMarkUpdateStrategy.APPLY_FULL, + 
chapter_actions: list[QuotationMarkUpdateStrategy] = [], ): self._default_chapter_action = default_chapter_action self._chapter_actions = chapter_actions - def get_action_for_chapter(self, chapter_number: int) -> QuotationDenormalizationAction: + def get_action_for_chapter(self, chapter_number: int) -> QuotationMarkUpdateStrategy: if chapter_number <= len(self._chapter_actions): return self._chapter_actions[chapter_number - 1] return self._default_chapter_action diff --git a/machine/corpora/quotation_denormalization_action.py b/machine/corpora/quotation_mark_update_strategy.py similarity index 51% rename from machine/corpora/quotation_denormalization_action.py rename to machine/corpora/quotation_mark_update_strategy.py index d036421b..ea66e5e2 100644 --- a/machine/corpora/quotation_denormalization_action.py +++ b/machine/corpora/quotation_mark_update_strategy.py @@ -1,7 +1,7 @@ from enum import Enum, auto -class QuotationDenormalizationAction(Enum): +class QuotationMarkUpdateStrategy(Enum): APPLY_FULL = auto() - APPLY_BASIC = auto() + APPLY_FALLBACK = auto() SKIP = auto() diff --git a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py new file mode 100644 index 00000000..153fcafe --- /dev/null +++ b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py @@ -0,0 +1,150 @@ +from typing import List, Union + +from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .analysis.quotation_mark_finder import QuotationMarkFinder +from .analysis.quotation_mark_resolver import QuotationMarkResolver +from .analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .analysis.quote_convention import QuoteConvention +from .analysis.quote_convention_set import QuoteConventionSet +from .analysis.text_segment import TextSegment +from .analysis.usfm_marker_type import UsfmMarkerType +from .fallback_quotation_mark_resolver 
import FallbackQuotationMarkResolver +from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings +from .quotation_mark_update_settings import QuotationMarkUpdateSettings +from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy +from .usfm_token import UsfmToken, UsfmTokenType +from .usfm_update_block import UsfmUpdateBlock +from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType +from .usfm_update_block_handler import UsfmUpdateBlockHandler + + +class QuoteConventionChangingUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): + + def __init__( + self, + source_quote_convention: QuoteConvention, + target_quote_convention: QuoteConvention, + settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), + ): + super().__init__() + self._source_quote_convention: QuoteConvention = source_quote_convention + self._target_quote_convention: QuoteConvention = target_quote_convention + self._settings: QuotationMarkUpdateSettings = settings + + self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( + QuoteConventionSet([self._source_quote_convention]) + ) + self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder() + + resolution_settings = QuotationMarkUpdateResolutionSettings( + self._source_quote_convention, self._target_quote_convention + ) + + # Each embed represents a separate context for quotation marks + # (i.e. you can't open a quote in one context and close it in another) + # so we need to keep track of the verse and embed contexts separately. 
+ self._verse_text_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver( + resolution_settings + ) + self._embed_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver( + resolution_settings + ) + self._simple_quotation_mark_resolver: FallbackQuotationMarkResolver = FallbackQuotationMarkResolver( + resolution_settings + ) + self._current_strategy = QuotationMarkUpdateStrategy.APPLY_FULL + self._current_chapter_number: int = 0 + self._current_verse_number: int = 0 + + def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: + self._check_for_chapter_change(block) + self._check_for_verse_change(block) + if self._current_strategy is QuotationMarkUpdateStrategy.SKIP: + return block + if self._current_strategy is QuotationMarkUpdateStrategy.APPLY_FALLBACK: + return self._apply_fallback_updating(block) + return self._apply_standard_updating(block) + + def _apply_fallback_updating(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: + for element in block._elements: + self._process_scripture_element(element, self._simple_quotation_mark_resolver) + return block + + def _apply_standard_updating(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: + for element in block._elements: + if element.type == UsfmUpdateBlockElementType.EMBED: + self._embed_quotation_mark_resolver.reset() + self._process_scripture_element(element, self._embed_quotation_mark_resolver) + else: + self._process_scripture_element(element, self._verse_text_quotation_mark_resolver) + + return block + + def _process_scripture_element( + self, element: UsfmUpdateBlockElement, quotation_mark_resolver: QuotationMarkResolver + ) -> None: + text_segments: List[TextSegment] = self._create_text_segments(element) + quotation_mark_matches: List[QuotationMarkStringMatch] = ( + self._quotation_mark_finder.find_all_potential_quotation_marks_in_text_segments(text_segments) + ) + for resolved_quotation_mark in 
quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches): + resolved_quotation_mark.update_quotation_mark(self._target_quote_convention) + + def _create_text_segments(self, element: UsfmUpdateBlockElement) -> List[TextSegment]: + text_segments: List[TextSegment] = [] + for token in element.get_tokens(): + if token.type == UsfmTokenType.VERSE: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) + elif token.type == UsfmTokenType.PARAGRAPH: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker) + elif token.type == UsfmTokenType.CHARACTER: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) + elif token.type == UsfmTokenType.NOTE: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + elif token.type == UsfmTokenType.TEXT: + text_segment: Union[TextSegment, None] = self._create_text_segment(token) + if text_segment is not None: + text_segments.append(text_segment) + return self._set_previous_and_next_for_segments(text_segments) + + def _create_text_segment(self, token: UsfmToken) -> Union[TextSegment, None]: + self._next_scripture_text_segment_builder.set_usfm_token(token) + if token.text is not None: + self._next_scripture_text_segment_builder.set_text(token.text) + text_segment_to_return: TextSegment = self._next_scripture_text_segment_builder.build() + self._next_scripture_text_segment_builder = TextSegment.Builder() + return text_segment_to_return + else: + self._next_scripture_text_segment_builder = TextSegment.Builder() + + def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) -> List[TextSegment]: + for i in range(len(text_segments)): + if i > 0: + text_segments[i].set_previous_segment(text_segments[i - 1]) + if i < len(text_segments) - 1: + text_segments[i].set_next_segment(text_segments[i + 1]) + return text_segments + + def 
_check_for_chapter_change(self, block: UsfmUpdateBlock) -> None: + for scripture_ref in block.refs: + if scripture_ref.chapter_num != self._current_chapter_number: + self._current_chapter_number = scripture_ref.chapter_num + self._start_new_chapter(self._current_chapter_number) + + def _start_new_chapter(self, new_chapter_number: int) -> None: + self._current_strategy = self._settings.get_action_for_chapter(new_chapter_number) + self._verse_text_quotation_mark_resolver.reset() + self._next_scripture_text_segment_builder = TextSegment.Builder() + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) + + def _check_for_verse_change(self, block: UsfmUpdateBlock) -> None: + for scripture_ref in block.refs: + if ( + scripture_ref.chapter_num == self._current_chapter_number + and scripture_ref.verse_num != self._current_verse_number + ): + self._current_verse_number = scripture_ref.verse_num + self._start_new_verse(self._current_verse_number) + + def _start_new_verse(self, new_chapter_number: int) -> None: + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) diff --git a/tests/corpora/analysis/test_quotation_mark_metadata.py b/tests/corpora/analysis/test_quotation_mark_metadata.py new file mode 100644 index 00000000..c81954a1 --- /dev/null +++ b/tests/corpora/analysis/test_quotation_mark_metadata.py @@ -0,0 +1,52 @@ +from typing import Union + +from machine.corpora.analysis import ( + QuotationMarkDirection, + QuotationMarkMetadata, + QuoteConvention, + TextSegment, + standard_quote_conventions, +) + + +def test_update_quotation_mark() -> None: + quotation_mark_metadata = QuotationMarkMetadata( + quotation_mark='"', + depth=1, + direction=QuotationMarkDirection.Opening, + text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), + start_index=22, + end_index=23, + ) + 
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) + assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said," + + quotation_mark_metadata = QuotationMarkMetadata( + quotation_mark='"', + depth=1, + direction=QuotationMarkDirection.Opening, + text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), + start_index=22, + end_index=23, + ) + quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) + assert quotation_mark_metadata.text_segment.text == "He said to the woman, «Has God really said," + + quotation_mark_metadata = QuotationMarkMetadata( + quotation_mark='"', + depth=1, + direction=QuotationMarkDirection.Opening, + text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), + start_index=23, + end_index=24, + ) + quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) + assert quotation_mark_metadata.text_segment.text == 'He said to the woman, "«as God really said,' + + +def get_quote_convention_by_name(name: str) -> QuoteConvention: + quote_convention: Union[QuoteConvention, None] = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(name) + ) + assert quote_convention is not None + return quote_convention diff --git a/tests/corpora/analysis/test_quotation_mark_tabulator.py b/tests/corpora/analysis/test_quotation_mark_tabulator.py new file mode 100644 index 00000000..25052a07 --- /dev/null +++ b/tests/corpora/analysis/test_quotation_mark_tabulator.py @@ -0,0 +1,139 @@ +# QuotationMarkCounts tests +from pytest import approx + +from machine.corpora.analysis import ( + QuotationMarkCounts, + QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkTabulator, + QuoteConvention, + SingleLevelQuoteConvention, + TextSegment, +) + + +def test_get_observed_count() -> None: + counts = 
QuotationMarkCounts() + assert counts.get_observed_count() == 0 + + counts.count_quotation_mark('"') + assert counts.get_observed_count() == 1 + + counts.count_quotation_mark('"') + assert counts.get_observed_count() == 2 + + counts.count_quotation_mark("'") + assert counts.get_observed_count() == 3 + + +def test_get_best_proportion() -> None: + counts = QuotationMarkCounts() + counts.count_quotation_mark('"') + counts.count_quotation_mark('"') + counts.count_quotation_mark("'") + + best_str, best_count, total_count = counts.get_best_proportion() + assert best_str == '"' + assert best_count == 2 + assert total_count == 3 + + counts.count_quotation_mark("'") + counts.count_quotation_mark("'") + + best_str, best_count, total_count = counts.get_best_proportion() + assert best_str == "'" + assert best_count == 3 + assert total_count == 5 + + +def test_calculate_num_differences() -> None: + counts = QuotationMarkCounts() + counts.count_quotation_mark('"') + counts.count_quotation_mark('"') + counts.count_quotation_mark("'") + + assert counts.calculate_num_differences('"') == 1 + assert counts.calculate_num_differences("'") == 2 + assert counts.calculate_num_differences("\u201c") == 3 + + counts.count_quotation_mark("'") + assert counts.calculate_num_differences('"') == 2 + assert counts.calculate_num_differences("'") == 2 + assert counts.calculate_num_differences("\u201c") == 4 + + +# QuotationMarkTabulator tests +def test_calculate_similarity() -> None: + single_level_quotation_mark_tabulator = QuotationMarkTabulator() + single_level_quotation_mark_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 0, 1), + ] + ) + + assert ( + single_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201c", "\u201d")]) + ) + == 1.0 + ) + assert ( + 
single_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201d", "\u201c")]) + ) + == 0.0 + ) + assert ( + single_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201c", '"')]) + ) + == 0.5 + ) + assert ( + single_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u00ab", "\u00bb")] + ) + ) + == 1.0 + ) + + empty_quotation_mark_tabulator = QuotationMarkTabulator() + assert ( + empty_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201c", "\u201d")]) + ) + == 0.0 + ) + + two_level_quotation_mark_tabulator = QuotationMarkTabulator() + two_level_quotation_mark_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 2), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 0, 2), + ] + ) + assert two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201c", "\u201d")]) + ) == approx(0.66666666666667, rel=1e-9) + assert ( + two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")] + ) + ) + == 1.0 + ) + assert two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u00ab", "\u00bb")] + ) + ) == approx(0.66666666666667, rel=1e-9) + assert two_level_quotation_mark_tabulator.calculate_similarity( + 
QuoteConvention( + "", [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u2018", "\u2019")] + ) + ) == approx(0.33333333333333, rel=1e-9) diff --git a/tests/corpora/test_basic_quotation_mark_resolver.py b/tests/corpora/test_fallback_quotation_mark_resolver.py similarity index 88% rename from tests/corpora/test_basic_quotation_mark_resolver.py rename to tests/corpora/test_fallback_quotation_mark_resolver.py index 8ea7a362..16ca3fac 100644 --- a/tests/corpora/test_basic_quotation_mark_resolver.py +++ b/tests/corpora/test_fallback_quotation_mark_resolver.py @@ -1,4 +1,4 @@ -from machine.corpora import BasicQuotationMarkResolver, QuotationDenormalizationResolutionSettings +from machine.corpora import FallbackQuotationMarkResolver, QuotationMarkUpdateResolutionSettings from machine.corpora.analysis import ( QuotationMarkDirection, QuotationMarkMetadata, @@ -17,8 +17,8 @@ def test_reset(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( - QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention, english_quote_convention) ) basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( @@ -37,8 +37,8 @@ def test_simple_quotation_mark_resolution(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( - QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) ) actual_resolved_quotation_marks = list( @@ -70,8 +70,8 @@ def test_is_opening_quote(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( 
- QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) ) # valid opening quote at start of segment @@ -113,7 +113,7 @@ def test_is_opening_quote_with_unambiguous_quote_convention(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( QuoteConventionDetectionResolutionSettings(QuoteConventionSet([english_quote_convention])) ) @@ -140,8 +140,8 @@ def test_is_opening_quote_stateful(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( - QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) ) # no preceding quote @@ -161,8 +161,8 @@ def test_does_most_recent_opening_mark_immediately_precede(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( - QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention, english_quote_convention) ) # no preceding quote @@ -201,8 +201,8 @@ def test_is_closing_quote(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( - QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), 
english_quote_convention) ) # valid closing quote at end of segment @@ -244,7 +244,7 @@ def test_is_closing_quote_with_unambiguous_quote_convention(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( QuoteConventionDetectionResolutionSettings(QuoteConventionSet([english_quote_convention])) ) @@ -271,8 +271,8 @@ def test_resolve_opening_quote(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( - QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) ) expected_resolved_quotation_mark = QuotationMarkMetadata( @@ -291,8 +291,8 @@ def test_resolve_closing_quote(): ) assert english_quote_convention is not None - basic_quotation_mark_resolver = BasicQuotationMarkResolver( - QuotationDenormalizationResolutionSettings(english_quote_convention, english_quote_convention) + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) ) expected_resolved_quotation_mark = QuotationMarkMetadata( @@ -310,4 +310,5 @@ def assert_resolved_quotation_marks_equal( ) -> None: assert len(actual_resolved_quotation_marks) == len(expected_resolved_quotation_marks) for actual_mark, expected_mark in zip(actual_resolved_quotation_marks, expected_resolved_quotation_marks): + print(f"Actual: {actual_mark.get_quotation_mark()}, Expected: {expected_mark.get_quotation_mark()}") assert actual_mark == expected_mark diff --git a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py index 4c7730b4..3bd706df 100644 --- 
a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py +++ b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py @@ -1,31 +1,13 @@ -from typing import Generator, List, Union +from typing import Union from machine.corpora import ( - QuotationDenormalizationAction, - QuotationDenormalizationSettings, QuotationDenormalizationUsfmUpdateBlockHandler, - ScriptureRef, + QuotationMarkUpdateSettings, + QuotationMarkUpdateStrategy, UpdateUsfmParserHandler, - UsfmToken, - UsfmTokenType, - UsfmUpdateBlock, - UsfmUpdateBlockElement, - UsfmUpdateBlockElementType, parse_usfm, ) -from machine.corpora.analysis import ( - QuotationMarkDirection, - QuotationMarkFinder, - QuotationMarkMetadata, - QuotationMarkResolutionIssue, - QuotationMarkResolutionSettings, - QuotationMarkResolver, - QuotationMarkStringMatch, - QuoteConventionSet, - TextSegment, - UsfmMarkerType, - standard_quote_conventions, -) +from machine.corpora.analysis import QuoteConvention, standard_quote_conventions simple_normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal @@ -308,105 +290,7 @@ def test_simple_arabic_quote_denormalization() -> None: assert_usfm_equal(observed_usfm, expected_usfm) -def test_quotes_spanning_verses() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle than any animal - of the field which Yahweh God had made. - He said to the woman, "Has God really said, - \\v 2 'You shall not eat of any tree of the garden'?" - """ - - expected_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " - + "the woman, “Has God really said, \n" - + "\\v 2 ‘You shall not eat of any tree of the garden’?”" - ) - - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") - assert_usfm_equal(observed_usfm, expected_usfm) - - -def test_single_embed() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle than any animal - \\f + \\ft "This is a 'footnote'" \\f* - of the field which Yahweh God had made. - """ - - expected_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal " - + "\\f + \\ft “This is a ‘footnote’” \\f* of the field which Yahweh God had made." - ) - - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") - assert_usfm_equal(observed_usfm, expected_usfm) - - -def test_multiple_embeds() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle than any animal - \\f + \\ft "This is a 'footnote'" \\f* - of the field \\f + \\ft Second "footnote" here \\f* which Yahweh God had made. - """ - - expected_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal " - + "\\f + \\ft “This is a ‘footnote’” \\f* of the field \\f + \\ft Second " - + "“footnote” here \\f* which Yahweh God had made." - ) - - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") - assert_usfm_equal(observed_usfm, expected_usfm) - - -def test_quotes_in_text_and_embed() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle than any animal - of the field which Yahweh God had made. - He said to the woman, "Has God really \\f + \\ft a - "footnote" in the "midst of 'text'" \\f* said, - 'You shall not eat of any tree of the garden'?" - """ - - expected_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " - + "the woman, “Has God really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " - + "said, ‘You shall not eat of any tree of the garden’?”" - ) - - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") - assert_usfm_equal(observed_usfm, expected_usfm) - - -def test_quotes_in_multiple_verses_and_embed() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle than any animal - of the field which Yahweh God had made. - He said to the woman, "Has God - \\v 2 really \\f + \\ft a - "footnote" in the "midst of 'text'" \\f* said, - 'You shall not eat of any tree of the garden'?" - """ - - expected_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " - + "the woman, “Has God\n" - + "\\v 2 really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " - + "said, ‘You shall not eat of any tree of the garden’?”" - ) - - observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") - assert_usfm_equal(observed_usfm, expected_usfm) - - -# Basic denormalization does not consider the nesting of quotation marks, -# but only determines opening/closing marks and maps based on that. 
-def test_basic_quotation_denormalization_same_as_full() -> None: +def test_fallback_quotation_denormalization_same_as_full() -> None: normalized_usfm = simple_normalized_usfm expected_usfm = ( "\\c 1\n" @@ -418,12 +302,12 @@ def test_basic_quotation_denormalization_same_as_full() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) -def test_basic_quotation_denormalization_incorrectly_nested() -> None: +def test_fallback_quotation_denormalization_incorrectly_nested() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -440,12 +324,12 @@ def test_basic_quotation_denormalization_incorrectly_nested() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) -def test_basic_quotation_denormalization_incorrectly_nested_second_case() -> None: +def test_fallback_quotation_denormalization_incorrectly_nested_second_case() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
@@ -462,12 +346,12 @@ def test_basic_quotation_denormalization_incorrectly_nested_second_case() -> Non normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) -def test_basic_quotation_denormalization_unclosed_quote() -> None: +def test_fallback_quotation_denormalization_unclosed_quote() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. @@ -484,410 +368,16 @@ def test_basic_quotation_denormalization_unclosed_quote() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) -def test_default_denormalization_action() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle than any animal - of the field which Yahweh God had made. - He said to the woman, "Has God really said, - You shall not eat of any tree of the garden'?" - """ - expected_full_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " - + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" - ) - - expected_basic_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " - + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" - ) - - expected_skipped_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " - + 'the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' - ) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - ) - assert_usfm_equal(observed_usfm, expected_full_usfm) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_FULL), - ) - assert_usfm_equal(observed_usfm, expected_full_usfm) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.APPLY_BASIC), - ) - assert_usfm_equal(observed_usfm, expected_basic_usfm) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings(default_chapter_action=QuotationDenormalizationAction.SKIP), - ) - assert_usfm_equal(observed_usfm, expected_skipped_usfm) - - -def test_single_chapter_denormalization_action() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle than any animal - of the field which Yahweh God had made. - He said to the woman, "Has God really said, - You shall not eat of any tree of the garden'?" - """ - expected_full_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " - + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" - ) - - expected_basic_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " - + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" - ) - - expected_skipped_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " - + 'the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' - ) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings(chapter_actions=[QuotationDenormalizationAction.APPLY_FULL]), - ) - assert_usfm_equal(observed_usfm, expected_full_usfm) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings(chapter_actions=[QuotationDenormalizationAction.APPLY_BASIC]), - ) - assert_usfm_equal(observed_usfm, expected_basic_usfm) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings(chapter_actions=[QuotationDenormalizationAction.SKIP]), - ) - assert_usfm_equal(observed_usfm, expected_skipped_usfm) - - -def test_multiple_chapter_same_denormalization_action() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle" than any animal - of the field which Yahweh God had made. - \\c 2 - \\v 1 He said to the woman, "Has God really said, - You shall not eat of any tree of the garden'?" 
- """ - expected_full_usfm = ( - "\\c 1\n" - + '\\v 1 Now the serpent was more subtle" than any animal of the field which Yahweh God had made.\n' - + "\\c 2\n" - + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" - ) - - expected_basic_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" - + "\\c 2\n" - + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" - ) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings( - chapter_actions=[QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_FULL] - ), - ) - assert_usfm_equal(observed_usfm, expected_full_usfm) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings( - chapter_actions=[QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_BASIC] - ), - ) - assert_usfm_equal(observed_usfm, expected_basic_usfm) - - -def test_multiple_chapter_multiple_denormalization_actions() -> None: - normalized_usfm = """\\c 1 - \\v 1 Now the serpent was more subtle" than any animal - of the field which Yahweh God had made. - \\c 2 - \\v 1 He said to the woman, "Has God really said, - You shall not eat of any tree of the garden'?" 
- """ - expected_full_then_basic_usfm = ( - "\\c 1\n" - + '\\v 1 Now the serpent was more subtle" than any animal of the field which Yahweh God had made.\n' - + "\\c 2\n" - + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" - ) - - expected_basic_then_full_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" - + "\\c 2\n" - + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" - ) - - expected_basic_then_skip_usfm = ( - "\\c 1\n" - + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" - + "\\c 2\n" - + '\\v 1 He said to the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' - ) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings( - chapter_actions=[QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_BASIC] - ), - ) - assert_usfm_equal(observed_usfm, expected_full_then_basic_usfm) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings( - chapter_actions=[QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_FULL] - ), - ) - assert_usfm_equal(observed_usfm, expected_basic_then_full_usfm) - - observed_usfm = denormalize_quotation_marks( - normalized_usfm, - "standard_english", - "standard_english", - QuotationDenormalizationSettings( - chapter_actions=[QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.SKIP] - ), - ) - assert_usfm_equal(observed_usfm, expected_basic_then_skip_usfm) - - -def test_process_scripture_element() -> None: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( - 
create_quotation_denormalization_usfm_update_block_handler("standard_english", "british_english") - ) - quotation_denormalizer._quotation_mark_finder = MockQuotationMarkFinder() - - update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( - UsfmUpdateBlockElementType.TEXT, - tokens=[UsfmToken(UsfmTokenType.TEXT, text="test segment")], - ) - mock_quotation_mark_resolver: QuotationMarkResolver = MockQuotationMarkResolver() - quotation_denormalizer._process_scripture_element(update_element, mock_quotation_mark_resolver) - - assert quotation_denormalizer._quotation_mark_finder.num_times_called == 1 - assert mock_quotation_mark_resolver.num_times_called == 1 - assert quotation_denormalizer._quotation_mark_finder.matches_to_return[0].text_segment.text == "this is a ‘test" - assert quotation_denormalizer._quotation_mark_finder.matches_to_return[1].text_segment.text == "the test ends” here" - - -def test_create_text_segments_basic() -> None: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( - create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") - ) - - update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( - UsfmUpdateBlockElementType.TEXT, tokens=[UsfmToken(UsfmTokenType.TEXT, text="test segment")] - ) - text_segments: List[TextSegment] = quotation_denormalizer._create_text_segments(update_element) - - assert len(text_segments) == 1 - assert text_segments[0].text == "test segment" - assert text_segments[0].immediate_preceding_marker is UsfmMarkerType.NoMarker - assert text_segments[0].markers_in_preceding_context == set() - assert text_segments[0].previous_segment is None - assert text_segments[0].next_segment is None - - -def test_create_text_segments_with_preceding_markers() -> None: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( - create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") - ) - - 
update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( - UsfmUpdateBlockElementType.TEXT, - tokens=[ - UsfmToken(UsfmTokenType.VERSE), - UsfmToken(UsfmTokenType.PARAGRAPH), - UsfmToken(UsfmTokenType.TEXT, text="test segment"), - ], - ) - text_segments: List[TextSegment] = quotation_denormalizer._create_text_segments(update_element) - - assert len(text_segments) == 1 - assert text_segments[0].text == "test segment" - assert text_segments[0].immediate_preceding_marker == UsfmMarkerType.ParagraphMarker - assert text_segments[0].markers_in_preceding_context == { - UsfmMarkerType.VerseMarker, - UsfmMarkerType.ParagraphMarker, - } - assert text_segments[0].previous_segment is None - assert text_segments[0].next_segment is None - - -def test_create_text_segments_with_multiple_text_tokens() -> None: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( - create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") - ) - - update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( - UsfmUpdateBlockElementType.TEXT, - tokens=[ - UsfmToken(UsfmTokenType.VERSE), - UsfmToken(UsfmTokenType.PARAGRAPH), - UsfmToken(UsfmTokenType.TEXT, text="test segment1"), - UsfmToken(UsfmTokenType.VERSE), - UsfmToken(UsfmTokenType.CHARACTER), - UsfmToken(UsfmTokenType.TEXT, text="test segment2"), - UsfmToken(UsfmTokenType.PARAGRAPH), - ], - ) - text_segments: List[TextSegment] = quotation_denormalizer._create_text_segments(update_element) - - assert len(text_segments) == 2 - assert text_segments[0].text == "test segment1" - assert text_segments[0].immediate_preceding_marker == UsfmMarkerType.ParagraphMarker - assert text_segments[0].markers_in_preceding_context == {UsfmMarkerType.VerseMarker, UsfmMarkerType.ParagraphMarker} - assert text_segments[0].previous_segment is None - assert text_segments[0].next_segment == text_segments[1] - assert text_segments[1].text == "test segment2" - assert 
text_segments[1].immediate_preceding_marker == UsfmMarkerType.CharacterMarker - assert text_segments[1].markers_in_preceding_context == {UsfmMarkerType.VerseMarker, UsfmMarkerType.CharacterMarker} - assert text_segments[1].previous_segment == text_segments[0] - assert text_segments[1].next_segment is None - - -def test_create_text_segment() -> None: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( - create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") - ) - - usfm_token: UsfmToken = UsfmToken(UsfmTokenType.TEXT, text="test segment") - segment: Union[TextSegment, None] = quotation_denormalizer._create_text_segment(usfm_token) - - assert segment is not None - assert segment.text == "test segment" - assert segment.immediate_preceding_marker is UsfmMarkerType.NoMarker - assert segment.markers_in_preceding_context == set() - assert segment.usfm_token == usfm_token - - -def test_set_previous_and_next_for_segments() -> None: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( - create_quotation_denormalization_usfm_update_block_handler("standard_english", "standard_english") - ) - - segments: List[TextSegment] = [ - TextSegment.Builder().set_text("segment 1 text").build(), - TextSegment.Builder().set_text("segment 2 text").build(), - TextSegment.Builder().set_text("segment 3 text").build(), - ] - - quotation_denormalizer._set_previous_and_next_for_segments(segments) - - assert segments[0].previous_segment is None - assert segments[0].next_segment == segments[1] - assert segments[1].previous_segment == segments[0] - assert segments[1].next_segment == segments[2] - assert segments[2].previous_segment == segments[1] - assert segments[2].next_segment is None - - -def test_check_for_chapter_change() -> None: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( - create_quotation_denormalization_usfm_update_block_handler("standard_english", 
"standard_english") - ) - - assert quotation_denormalizer._current_chapter_number == 0 - - quotation_denormalizer._check_for_chapter_change(UsfmUpdateBlock([ScriptureRef.parse("MAT 1:1")], [])) - - assert quotation_denormalizer._current_chapter_number == 1 - - quotation_denormalizer._check_for_chapter_change(UsfmUpdateBlock([ScriptureRef.parse("ISA 15:22")], [])) - - assert quotation_denormalizer._current_chapter_number == 15 - - -def test_start_new_chapter() -> None: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( - create_quotation_denormalization_usfm_update_block_handler( - "standard_english", - "standard_english", - QuotationDenormalizationSettings( - chapter_actions=[ - QuotationDenormalizationAction.SKIP, - QuotationDenormalizationAction.APPLY_FULL, - QuotationDenormalizationAction.APPLY_BASIC, - ] - ), - ) - ) - - quotation_denormalizer._next_scripture_text_segment_builder.add_preceding_marker( - UsfmMarkerType.EmbedMarker - ).set_text("this text should be erased") - quotation_denormalizer._verse_text_quotation_mark_resolver._issues.add( - QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK - ) - - quotation_denormalizer._start_new_chapter(1) - segment = quotation_denormalizer._next_scripture_text_segment_builder.build() - assert quotation_denormalizer._current_denormalization_action == QuotationDenormalizationAction.SKIP - assert segment.immediate_preceding_marker == UsfmMarkerType.ChapterMarker - assert segment.text == "" - assert UsfmMarkerType.EmbedMarker not in segment.markers_in_preceding_context - assert quotation_denormalizer._verse_text_quotation_mark_resolver._issues == set() - - quotation_denormalizer._start_new_chapter(2) - assert quotation_denormalizer._current_denormalization_action == QuotationDenormalizationAction.APPLY_FULL - - quotation_denormalizer._start_new_chapter(3) - assert quotation_denormalizer._current_denormalization_action == QuotationDenormalizationAction.APPLY_BASIC - - def 
denormalize_quotation_marks( normalized_usfm: str, source_quote_convention_name: str, target_quote_convention_name: str, - quotation_denormalization_settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings(), + quotation_denormalization_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), ) -> str: quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( create_quotation_denormalization_usfm_update_block_handler( @@ -904,17 +394,10 @@ def denormalize_quotation_marks( def create_quotation_denormalization_usfm_update_block_handler( source_quote_convention_name: str, target_quote_convention_name: str, - quotation_denormalization_settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings(), + quotation_denormalization_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), ) -> QuotationDenormalizationUsfmUpdateBlockHandler: - source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( - source_quote_convention_name - ) - assert source_quote_convention is not None - - target_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( - target_quote_convention_name - ) - assert target_quote_convention is not None + source_quote_convention = get_quote_convention_by_name(source_quote_convention_name) + target_quote_convention = get_quote_convention_by_name(target_quote_convention_name) return QuotationDenormalizationUsfmUpdateBlockHandler( source_quote_convention, @@ -928,38 +411,9 @@ def assert_usfm_equal(observed_usfm: str, expected_usfm: str) -> None: assert observed_line.strip() == expected_line.strip() -class MockQuotationMarkFinder(QuotationMarkFinder): - def __init__(self) -> None: - super().__init__(QuoteConventionSet([])) - self.num_times_called = 0 - self.matches_to_return = [ - QuotationMarkStringMatch(TextSegment.Builder().set_text('this is a "test').build(), 10, 11), - 
QuotationMarkStringMatch(TextSegment.Builder().set_text('the test ends" here').build(), 13, 14), - ] - - def find_all_potential_quotation_marks_in_text_segments( - self, text_segments: List[TextSegment] - ) -> List[QuotationMarkStringMatch]: - self.num_times_called += 1 - return self.matches_to_return - - -class MockQuotationMarkResolver(QuotationMarkResolver): - def __init__(self): - super().__init__(QuotationMarkResolutionSettings()) - self.num_times_called = 0 - - def resolve_quotation_marks( - self, quote_matches: List[QuotationMarkStringMatch] - ) -> Generator[QuotationMarkMetadata, None, None]: - self.num_times_called += 1 - current_depth = 1 - current_direction = QuotationMarkDirection.Opening - for quote_match in quote_matches: - yield quote_match.resolve(current_depth, current_direction) - current_depth += 1 - current_direction = ( - QuotationMarkDirection.Closing - if current_direction == QuotationMarkDirection.Opening - else QuotationMarkDirection.Opening - ) +def get_quote_convention_by_name(name: str) -> QuoteConvention: + quote_convention: Union[QuoteConvention, None] = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(name) + ) + assert quote_convention is not None + return quote_convention diff --git a/tests/corpora/test_quotation_denormalization_first_pass.py b/tests/corpora/test_quotation_mark_update_first_pass.py similarity index 54% rename from tests/corpora/test_quotation_denormalization_first_pass.py rename to tests/corpora/test_quotation_mark_update_first_pass.py index d87fb918..3ccc937c 100644 --- a/tests/corpora/test_quotation_denormalization_first_pass.py +++ b/tests/corpora/test_quotation_mark_update_first_pass.py @@ -1,6 +1,6 @@ from typing import List, Union -from machine.corpora import QuotationDenormalizationAction, QuotationDenormalizationFirstPass, parse_usfm +from machine.corpora import QuotationMarkUpdateFirstPass, QuotationMarkUpdateStrategy, parse_usfm from machine.corpora.analysis import ( 
Chapter, QuotationMarkResolutionIssue, @@ -11,114 +11,218 @@ ) -def test_check_whether_basic_denormalization_will_work() -> None: +def test_check_whether_fallback_mode_will_work() -> None: - first_pass_analyzer = QuotationDenormalizationFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + first_pass_analyzer = QuotationMarkUpdateFirstPass(QuoteConvention("", []), QuoteConvention("", [])) - # Cases where we expect basic denormalization to work + # Cases where we expect fallback mode to work assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("standard_english"), get_quote_convention_by_name("standard_english"), ) is True ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("standard_french"), get_quote_convention_by_name("british_english"), ) is True ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("typewriter_western_european"), get_quote_convention_by_name("standard_russian"), ) is True ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("typewriter_western_european_variant"), get_quote_convention_by_name("standard_arabic"), ) is True ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("central_european"), get_quote_convention_by_name("british_typewriter_english"), ) is True ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("standard_swedish"), 
get_quote_convention_by_name("typewriter_french"), ) is True ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("standard_finnish"), get_quote_convention_by_name("british_inspired_western_european"), ) is True ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("eastern_european"), get_quote_convention_by_name("central_european"), ) is True ) - # Cases where we expect basic denormalization to fail + # Cases where we expect fallback mode to fail assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( - get_quote_convention_by_name("western_european"), + first_pass_analyzer._check_whether_fallback_mode_will_work( get_quote_convention_by_name("standard_english"), + get_quote_convention_by_name("western_european"), ) is False ) + assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_french"), + get_quote_convention_by_name("western_european"), + ) + is False + ) + + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_french"), get_quote_convention_by_name("french_variant"), - get_quote_convention_by_name("hybrid_typewriter_english"), ) is False ) + + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("central_european"), + get_quote_convention_by_name("typewriter_western_european"), + ) + is False + ) + + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("eastern_european"), + get_quote_convention_by_name("standard_russian"), + ) + is False + ) + + +def 
test_check_whether_fallback_mode_will_work_with_normalized_conventions() -> None: + + first_pass_analyzer = QuotationMarkUpdateFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + + # Cases where we expect fallback mode to work + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_english").normalize(), + get_quote_convention_by_name("standard_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_french").normalize(), + get_quote_convention_by_name("british_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_western_european").normalize(), + get_quote_convention_by_name("standard_russian"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_western_european_variant").normalize(), + get_quote_convention_by_name("standard_arabic"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("central_european").normalize(), + get_quote_convention_by_name("british_typewriter_english"), + ) + is True + ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_swedish").normalize(), + get_quote_convention_by_name("typewriter_french"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_finnish").normalize(), get_quote_convention_by_name("british_inspired_western_european"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("eastern_european").normalize(), + get_quote_convention_by_name("central_european"), + ) 
+ is True + ) + + # Cases where we expect fallback mode to fail + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("western_european").normalize(), + get_quote_convention_by_name("standard_english"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("french_variant").normalize(), + get_quote_convention_by_name("hybrid_typewriter_english"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("british_inspired_western_european").normalize(), get_quote_convention_by_name("standard_russian"), ) is False ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( - get_quote_convention_by_name("typewriter_english"), + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_english").normalize(), get_quote_convention_by_name("western_european"), ) is False ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( - get_quote_convention_by_name("central_european_guillemets"), + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("central_european_guillemets").normalize(), get_quote_convention_by_name("french_variant"), ) is False ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( - get_quote_convention_by_name("standard_arabic"), + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_arabic").normalize(), get_quote_convention_by_name("hybrid_typewriter_english"), ) is False ) assert ( - first_pass_analyzer._check_whether_basic_denormalization_will_work( - get_quote_convention_by_name("standard_russian"), + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_russian").normalize(), 
get_quote_convention_by_name("standard_french"), ) is False @@ -127,116 +231,130 @@ def test_check_whether_basic_denormalization_will_work() -> None: def test_choose_best_action_for_chapter() -> None: # Verse text with no issues - actual_action = run_quotation_denormalization_first_pass_on_chapter( + actual_action = run_first_pass_on_chapter( [ "Now the serpent was more subtle than any animal " + "of the field which Yahweh God had made. " - + 'He said to the woman, "Has God really said, ' - + "'You shall not eat of any tree of the garden'?\"" + + "He said to the woman, “Has God really said, " + + "‘You shall not eat of any tree of the garden’?”" ], "standard_english", "standard_english", ) - expected_action = QuotationDenormalizationAction.APPLY_FULL + expected_action = QuotationMarkUpdateStrategy.APPLY_FULL assert actual_action == expected_action # Verse text with unpaired opening quotation mark - actual_action = run_quotation_denormalization_first_pass_on_chapter( + actual_action = run_first_pass_on_chapter( [ "Now the serpent was more subtle than any animal " + "of the field which Yahweh God had made. " - + 'He said to the woman, "Has God really said, ' - + "'You shall not eat of any tree of the garden'?" + + "He said to the woman, “Has God really said, " + + "‘You shall not eat of any tree of the garden’?" ], "standard_english", "standard_english", ) - expected_action = QuotationDenormalizationAction.APPLY_BASIC + expected_action = QuotationMarkUpdateStrategy.APPLY_FALLBACK assert actual_action == expected_action # Verse text with unpaired closing quotation mark - actual_action = run_quotation_denormalization_first_pass_on_chapter( + actual_action = run_first_pass_on_chapter( [ "Now the serpent was more subtle than any animal " + "of the field which Yahweh God had made. 
" + "He said to the woman, Has God really said, " - + 'You shall not eat of any tree of the garden?"' + + "You shall not eat of any tree of the garden?”" ], "standard_english", "standard_english", ) - expected_action = QuotationDenormalizationAction.APPLY_BASIC + expected_action = QuotationMarkUpdateStrategy.APPLY_FALLBACK assert actual_action == expected_action # Verse text with too deeply nested quotation marks - actual_action = run_quotation_denormalization_first_pass_on_chapter( + actual_action = run_first_pass_on_chapter( [ - '"Now the serpent was more "subtle than any animal ' - + 'of the "field which "Yahweh God had made. ' - + 'He said to the woman, "Has God really said, ' - + '"You shall not eat of any tree of the garden?' + "“Now the serpent was more “subtle than any animal " + + "of the “field which “Yahweh God had made. " + + "He said to the woman, “Has God really said, " + + "“You shall not eat of any tree of the garden?" ], "standard_english", "standard_english", ) - expected_action = QuotationDenormalizationAction.SKIP + expected_action = QuotationMarkUpdateStrategy.APPLY_FALLBACK assert actual_action == expected_action # Verse text with an ambiguous quotation mark - actual_action = run_quotation_denormalization_first_pass_on_chapter( + actual_action = run_first_pass_on_chapter( [ "Now the serpent was more subtle than any animal " + "of the field which Yahweh God had made. " + 'He said to the woman"Has God really said, ' + "You shall not eat of any tree of the garden?" ], - "standard_english", + "typewriter_english", "standard_english", ) - expected_action = QuotationDenormalizationAction.SKIP + expected_action = QuotationMarkUpdateStrategy.SKIP assert actual_action == expected_action # Verse text with an ambiguous quotation mark - actual_action = run_quotation_denormalization_first_pass_on_chapter( + actual_action = run_first_pass_on_chapter( [ "Now the serpent was more subtle than any animal " + "of the field which Yahweh God had made. 
" + 'He said to the woman"Has God really said, ' + "You shall not eat of any tree of the garden?" ], + "typewriter_english", "standard_english", + ) + expected_action = QuotationMarkUpdateStrategy.SKIP + assert actual_action == expected_action + + # Verse text with too deeply nested ambiguous quotation marks + actual_action = run_first_pass_on_chapter( + [ + '"Now the serpent was more "subtle than any animal ' + + 'of the "field which "Yahweh God had made. ' + + 'He said to the woman, "Has God really said, ' + + '"You shall not eat of any tree of the garden?' + ], + "typewriter_english", "standard_english", ) - expected_action = QuotationDenormalizationAction.SKIP + expected_action = QuotationMarkUpdateStrategy.SKIP assert actual_action == expected_action def test_choose_best_action_based_on_observed_issues() -> None: - first_pass_analyzer = QuotationDenormalizationFirstPass(QuoteConvention("", []), QuoteConvention("", [])) - first_pass_analyzer._will_basic_denormalization_work = False + first_pass_analyzer = QuotationMarkUpdateFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + first_pass_analyzer._will_fallback_mode_work = False # Test with no issues best_action = first_pass_analyzer._choose_best_action_based_on_observed_issues([]) - assert best_action == QuotationDenormalizationAction.APPLY_FULL + assert best_action == QuotationMarkUpdateStrategy.APPLY_FULL # Test with one issue assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( [QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK] ) - == QuotationDenormalizationAction.SKIP + == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( [QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK] ) - == QuotationDenormalizationAction.SKIP + == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( [QuotationMarkResolutionIssue.TOO_DEEP_NESTING] ) - == 
QuotationDenormalizationAction.SKIP + == QuotationMarkUpdateStrategy.SKIP ) # Test with multiple issues @@ -247,7 +365,7 @@ def test_choose_best_action_based_on_observed_issues() -> None: QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, ] ) - == QuotationDenormalizationAction.SKIP + == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( @@ -256,7 +374,7 @@ def test_choose_best_action_based_on_observed_issues() -> None: QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, ] ) - == QuotationDenormalizationAction.SKIP + == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( @@ -265,36 +383,36 @@ def test_choose_best_action_based_on_observed_issues() -> None: QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, ] ) - == QuotationDenormalizationAction.SKIP + == QuotationMarkUpdateStrategy.SKIP ) def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> None: - first_pass_analyzer = QuotationDenormalizationFirstPass(QuoteConvention("", []), QuoteConvention("", [])) - first_pass_analyzer._will_basic_denormalization_work = True + first_pass_analyzer = QuotationMarkUpdateFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + first_pass_analyzer._will_fallback_mode_work = True # Test with no issues best_action = first_pass_analyzer._choose_best_action_based_on_observed_issues([]) - assert best_action == QuotationDenormalizationAction.APPLY_FULL + assert best_action == QuotationMarkUpdateStrategy.APPLY_FULL # Test with one issue assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( [QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK] ) - == QuotationDenormalizationAction.APPLY_BASIC + == QuotationMarkUpdateStrategy.APPLY_FALLBACK ) assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( [QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK] ) - == QuotationDenormalizationAction.SKIP + 
== QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( [QuotationMarkResolutionIssue.TOO_DEEP_NESTING] ) - == QuotationDenormalizationAction.APPLY_BASIC + == QuotationMarkUpdateStrategy.APPLY_FALLBACK ) # Test with multiple issues @@ -305,7 +423,7 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, ] ) - == QuotationDenormalizationAction.SKIP + == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( @@ -314,7 +432,7 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No QuotationMarkResolutionIssue.TOO_DEEP_NESTING, ] ) - == QuotationDenormalizationAction.SKIP + == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_action_based_on_observed_issues( @@ -323,7 +441,7 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, ] ) - == QuotationDenormalizationAction.APPLY_BASIC + == QuotationMarkUpdateStrategy.APPLY_FALLBACK ) @@ -332,11 +450,11 @@ def test_no_issues_in_usfm() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. - He said to the woman, "Has God really said, - 'You shall not eat of any tree of the garden'?" 
+ He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” """ - expected_actions = [QuotationDenormalizationAction.APPLY_FULL] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FULL] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") assert expected_actions == observed_actions @@ -345,11 +463,11 @@ def test_unpaired_opening_mark() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. - He said to the woman, "Has God really said, - 'You shall not eat of any tree of the garden'? + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’? """ - expected_actions = [QuotationDenormalizationAction.APPLY_BASIC] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") assert expected_actions == observed_actions @@ -359,23 +477,23 @@ def test_unpaired_closing_mark() -> None: \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, Has God really said, - You shall not eat of any tree of the garden?" 
+ You shall not eat of any tree of the garden?” """ - expected_actions = [QuotationDenormalizationAction.APPLY_BASIC] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") assert expected_actions == observed_actions def test_too_deep_nesting() -> None: normalized_usfm = """\\c 1 - \\v 1 "Now the serpent was more "subtle than any animal - of the "field which "Yahweh God had made. - He said to the woman, "Has God really said, - "You shall not eat of any tree of the garden? + \\v 1 “Now the serpent was more “subtle than any animal + of the “field which “Yahweh God had made. + He said to the woman, “Has God really said, + “You shall not eat of any tree of the garden? """ - expected_actions = [QuotationDenormalizationAction.APPLY_BASIC] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") assert expected_actions == observed_actions @@ -387,8 +505,8 @@ def test_ambiguous_quotation_mark() -> None: He said to the woman"Has God really said, You shall not eat of any tree of the garden? 
""" - expected_actions = [QuotationDenormalizationAction.SKIP] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.SKIP] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") assert expected_actions == observed_actions @@ -397,11 +515,11 @@ def test_no_issues_in_multiple_chapters() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. - \\c 2 \\v 1 He said to the woman, "Has God really said, - 'You shall not eat of any tree of the garden'?" + \\c 2 \\v 1 He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” """ - expected_actions = [QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_FULL] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FULL] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") assert expected_actions == observed_actions @@ -411,10 +529,10 @@ def test_unpaired_quotation_mark_in_second_chapter() -> None: \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. \\c 2 \\v 1 He said to the woman, Has God really said, - You shall not eat of any tree of the garden?" 
+ You shall not eat of any tree of the garden?” """ - expected_actions = [QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.APPLY_BASIC] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") assert expected_actions == observed_actions @@ -422,12 +540,12 @@ def test_unpaired_quotation_mark_in_second_chapter() -> None: def test_unpaired_quotation_mark_in_first_chapter() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal - of the field which Yahweh God had" made. + of the field which Yahweh God had” made. \\c 2 \\v 1 He said to the woman, Has God really said, - "You shall not eat of any tree of the garden?" + “You shall not eat of any tree of the garden?” """ - expected_actions = [QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_FULL] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FULL] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") assert expected_actions == observed_actions @@ -439,8 +557,8 @@ def test_ambiguous_quotation_mark_in_second_chapter() -> None: \\c 2 \\v 1 He said to the woman, Has God really said, You shall not"eat of any tree of the garden?" 
""" - expected_actions = [QuotationDenormalizationAction.APPLY_FULL, QuotationDenormalizationAction.SKIP] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.SKIP] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") assert expected_actions == observed_actions @@ -452,8 +570,8 @@ def test_ambiguous_quotation_mark_in_first_chapter() -> None: \\c 2 \\v 1 He said to the woman, Has God really said, "You shall not eat of any tree of the garden?" """ - expected_actions = [QuotationDenormalizationAction.SKIP, QuotationDenormalizationAction.APPLY_FULL] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.SKIP, QuotationMarkUpdateStrategy.APPLY_FULL] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") assert expected_actions == observed_actions @@ -461,12 +579,12 @@ def test_ambiguous_quotation_mark_in_first_chapter() -> None: def test_unpaired_quotation_mark_in_both_chapters() -> None: normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal - of the field which Yahweh God had" made. + of the field which Yahweh God had” made. \\c 2 \\v 1 He said to the woman, Has God really said, - You shall not eat of any tree of the garden?" 
+ You shall not eat of any tree of the garden?” """ - expected_actions = [QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.APPLY_BASIC] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") assert expected_actions == observed_actions @@ -478,8 +596,8 @@ def test_ambiguous_quotation_mark_in_both_chapters() -> None: \\c 2 \\v 1 He said to the woman, Has God really said, You shall not eat of any"tree of the garden? """ - expected_actions = [QuotationDenormalizationAction.SKIP, QuotationDenormalizationAction.SKIP] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.SKIP, QuotationMarkUpdateStrategy.SKIP] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") assert expected_actions == observed_actions @@ -491,8 +609,8 @@ def test_unpaired_in_first_ambiguous_in_second() -> None: \\c 2 \\v 1 He said to the woman, Has God really said, You shall not eat of any"tree of the garden? """ - expected_actions = [QuotationDenormalizationAction.APPLY_BASIC, QuotationDenormalizationAction.SKIP] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.SKIP] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") assert expected_actions == observed_actions @@ -504,15 +622,15 @@ def test_ambiguous_in_first_unpaired_in_second() -> None: \\c 2 \\v 1 He said to the woman, Has God really said, You shall not eat of any tree of the garden?" 
""" - expected_actions = [QuotationDenormalizationAction.SKIP, QuotationDenormalizationAction.APPLY_BASIC] - observed_actions = run_quotation_denormalization_first_pass(normalized_usfm, "standard_english", "standard_english") + expected_actions = [QuotationMarkUpdateStrategy.SKIP, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") assert expected_actions == observed_actions -def run_quotation_denormalization_first_pass( +def run_first_pass( normalized_usfm: str, source_quote_convention_name: str, target_quote_convention_name: str -) -> List[QuotationDenormalizationAction]: +) -> List[QuotationMarkUpdateStrategy]: source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( source_quote_convention_name ) @@ -523,15 +641,15 @@ def run_quotation_denormalization_first_pass( ) assert target_quote_convention is not None - first_pass_analyzer = QuotationDenormalizationFirstPass(source_quote_convention, target_quote_convention) + first_pass_analyzer = QuotationMarkUpdateFirstPass(source_quote_convention, target_quote_convention) parse_usfm(normalized_usfm, first_pass_analyzer) return first_pass_analyzer.get_best_actions_by_chapter() -def run_quotation_denormalization_first_pass_on_chapter( +def run_first_pass_on_chapter( verse_texts: List[str], source_quote_convention_name: str, target_quote_convention_name: str -) -> QuotationDenormalizationAction: +) -> QuotationMarkUpdateStrategy: source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( source_quote_convention_name ) @@ -542,7 +660,7 @@ def run_quotation_denormalization_first_pass_on_chapter( ) assert target_quote_convention is not None - first_pass_analyzer = QuotationDenormalizationFirstPass(source_quote_convention, target_quote_convention) + first_pass_analyzer = QuotationMarkUpdateFirstPass(source_quote_convention, 
target_quote_convention) chapter = Chapter([Verse([TextSegment.Builder().set_text(verse_text).build() for verse_text in verse_texts])]) diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py new file mode 100644 index 00000000..baab9180 --- /dev/null +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -0,0 +1,692 @@ +from typing import Generator, List, Union + +from machine.corpora import ( + QuotationMarkUpdateSettings, + QuotationMarkUpdateStrategy, + QuoteConventionChangingUsfmUpdateBlockHandler, + ScriptureRef, + UpdateUsfmParserHandler, + UsfmToken, + UsfmTokenType, + UsfmUpdateBlock, + UsfmUpdateBlockElement, + UsfmUpdateBlockElementType, + parse_usfm, +) +from machine.corpora.analysis import ( + QuotationMarkDirection, + QuotationMarkFinder, + QuotationMarkMetadata, + QuotationMarkResolutionIssue, + QuotationMarkResolutionSettings, + QuotationMarkResolver, + QuotationMarkStringMatch, + QuoteConventionSet, + TextSegment, + UsfmMarkerType, + standard_quote_conventions, +) + + +def test_quotes_spanning_verses() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + \\v 2 “You shall not eat of any tree of the garden”?» + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, \n" + + "\\v 2 ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_single_embed() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + \\f + \\ft «This is a “footnote”» \\f* + of the field which Yahweh God had made. + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field which Yahweh God had made." + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_multiple_embeds() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + \\f + \\ft «This is a “footnote”» \\f* + of the field \\f + \\ft Second «footnote» here \\f* which Yahweh God had made. + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field \\f + \\ft Second " + + "“footnote” here \\f* which Yahweh God had made." + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_quotes_in_text_and_embed() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really \\f + \\ft a + «footnote» in the «midst of “text”» \\f* said, + “You shall not eat of any tree of the garden”?» + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_quotes_in_multiple_verses_and_embed() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God + \\v 2 really \\f + \\ft a + «footnote» in the «midst of “text”» \\f* said, + “You shall not eat of any tree of the garden”?» + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God\n" + + "\\v 2 really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# Fallback mode does not consider the nesting of quotation marks, +# but only determines opening/closing marks and maps based on that. +def test_fallback_strategy_same_as_full() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + “You shall not eat of any tree of the garden”?’ + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "british_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_fallback_strategy_incorrectly_nested() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + ‘You shall not eat of any tree of the garden’?’ + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "british_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_fallback_strategy_incorrectly_nested_second_case() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?’ + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "british_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_fallback_strategy_unclosed_quote() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + You shall not eat of any tree of the garden”?’ + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "british_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_default_quotation_mark_update_strategy() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" + """ + expected_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_basic_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_skipped_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + 'the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FULL), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_basic_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.SKIP), + ) + assert_usfm_equal(observed_usfm, expected_skipped_usfm) + + +def test_single_chapter_quotation_mark_update_strategy() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" + """ + expected_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_basic_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_skipped_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + 'the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FULL]), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FALLBACK]), + ) + assert_usfm_equal(observed_usfm, expected_basic_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(chapter_actions=[QuotationMarkUpdateStrategy.SKIP]), + ) + assert_usfm_equal(observed_usfm, expected_skipped_usfm) + + +def test_multiple_chapter_same_strategy() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle" than any animal + of the field which Yahweh God had made. + \\c 2 + \\v 1 He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" 
+ """ + expected_full_usfm = ( + "\\c 1\n" + + '\\v 1 Now the serpent was more subtle" than any animal of the field which Yahweh God had made.\n' + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_fallback_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FULL] + ), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + ), + ) + assert_usfm_equal(observed_usfm, expected_fallback_usfm) + + +def test_multiple_chapter_multiple_strategies() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle" than any animal + of the field which Yahweh God had made. + \\c 2 + \\v 1 He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" 
+ """ + expected_full_then_fallback_usfm = ( + "\\c 1\n" + + '\\v 1 Now the serpent was more subtle" than any animal of the field which Yahweh God had made.\n' + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_fallback_then_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_fallback_then_skip_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + '\\v 1 He said to the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + ), + ) + assert_usfm_equal(observed_usfm, expected_full_then_fallback_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FULL] + ), + ) + assert_usfm_equal(observed_usfm, expected_fallback_then_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.SKIP] + ), + ) + assert_usfm_equal(observed_usfm, expected_fallback_then_skip_usfm) + + +def test_process_scripture_element() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + 
create_quote_convention_changing_usfm_update_block_handler("standard_english", "british_english") + ) + quote_convention_changer._quotation_mark_finder = MockQuotationMarkFinder() + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[UsfmToken(UsfmTokenType.TEXT, text="test segment")], + ) + mock_quotation_mark_resolver: QuotationMarkResolver = MockQuotationMarkResolver() + quote_convention_changer._process_scripture_element(update_element, mock_quotation_mark_resolver) + + assert quote_convention_changer._quotation_mark_finder.num_times_called == 1 + assert mock_quotation_mark_resolver.num_times_called == 1 + assert quote_convention_changer._quotation_mark_finder.matches_to_return[0].text_segment.text == "this is a ‘test" + assert ( + quote_convention_changer._quotation_mark_finder.matches_to_return[1].text_segment.text == "the test ends” here" + ) + + +def test_create_text_segments_basic() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, tokens=[UsfmToken(UsfmTokenType.TEXT, text="test segment")] + ) + text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) + + assert len(text_segments) == 1 + assert text_segments[0].text == "test segment" + assert text_segments[0].immediate_preceding_marker is UsfmMarkerType.NoMarker + assert text_segments[0].markers_in_preceding_context == set() + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment is None + + +def test_create_text_segments_with_preceding_markers() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") 
+ ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[ + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.PARAGRAPH), + UsfmToken(UsfmTokenType.TEXT, text="test segment"), + ], + ) + text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) + + assert len(text_segments) == 1 + assert text_segments[0].text == "test segment" + assert text_segments[0].immediate_preceding_marker == UsfmMarkerType.ParagraphMarker + assert text_segments[0].markers_in_preceding_context == { + UsfmMarkerType.VerseMarker, + UsfmMarkerType.ParagraphMarker, + } + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment is None + + +def test_create_text_segments_with_multiple_text_tokens() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[ + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.PARAGRAPH), + UsfmToken(UsfmTokenType.TEXT, text="test segment1"), + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.CHARACTER), + UsfmToken(UsfmTokenType.TEXT, text="test segment2"), + UsfmToken(UsfmTokenType.PARAGRAPH), + ], + ) + text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) + + assert len(text_segments) == 2 + assert text_segments[0].text == "test segment1" + assert text_segments[0].immediate_preceding_marker == UsfmMarkerType.ParagraphMarker + assert text_segments[0].markers_in_preceding_context == {UsfmMarkerType.VerseMarker, UsfmMarkerType.ParagraphMarker} + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment == text_segments[1] + assert text_segments[1].text == "test segment2" + assert 
text_segments[1].immediate_preceding_marker == UsfmMarkerType.CharacterMarker + assert text_segments[1].markers_in_preceding_context == {UsfmMarkerType.VerseMarker, UsfmMarkerType.CharacterMarker} + assert text_segments[1].previous_segment == text_segments[0] + assert text_segments[1].next_segment is None + + +def test_create_text_segment() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + usfm_token: UsfmToken = UsfmToken(UsfmTokenType.TEXT, text="test segment") + segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token) + + assert segment is not None + assert segment.text == "test segment" + assert segment.immediate_preceding_marker is UsfmMarkerType.NoMarker + assert segment.markers_in_preceding_context == set() + assert segment.usfm_token == usfm_token + + +def test_set_previous_and_next_for_segments() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + segments: List[TextSegment] = [ + TextSegment.Builder().set_text("segment 1 text").build(), + TextSegment.Builder().set_text("segment 2 text").build(), + TextSegment.Builder().set_text("segment 3 text").build(), + ] + + quote_convention_changer._set_previous_and_next_for_segments(segments) + + assert segments[0].previous_segment is None + assert segments[0].next_segment == segments[1] + assert segments[1].previous_segment == segments[0] + assert segments[1].next_segment == segments[2] + assert segments[2].previous_segment == segments[1] + assert segments[2].next_segment is None + + +def test_check_for_chapter_change() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", 
"standard_english") + ) + + assert quote_convention_changer._current_chapter_number == 0 + + quote_convention_changer._check_for_chapter_change(UsfmUpdateBlock([ScriptureRef.parse("MAT 1:1")], [])) + + assert quote_convention_changer._current_chapter_number == 1 + + quote_convention_changer._check_for_chapter_change(UsfmUpdateBlock([ScriptureRef.parse("ISA 15:22")], [])) + + assert quote_convention_changer._current_chapter_number == 15 + + +def test_start_new_chapter() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler( + "standard_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_actions=[ + QuotationMarkUpdateStrategy.SKIP, + QuotationMarkUpdateStrategy.APPLY_FULL, + QuotationMarkUpdateStrategy.APPLY_FALLBACK, + ] + ), + ) + ) + + quote_convention_changer._next_scripture_text_segment_builder.add_preceding_marker( + UsfmMarkerType.EmbedMarker + ).set_text("this text should be erased") + quote_convention_changer._verse_text_quotation_mark_resolver._issues.add( + QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK + ) + + quote_convention_changer._start_new_chapter(1) + segment = quote_convention_changer._next_scripture_text_segment_builder.build() + assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP + assert segment.immediate_preceding_marker == UsfmMarkerType.ChapterMarker + assert segment.text == "" + assert UsfmMarkerType.EmbedMarker not in segment.markers_in_preceding_context + assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set() + + quote_convention_changer._start_new_chapter(2) + assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.APPLY_FULL + + quote_convention_changer._start_new_chapter(3) + assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.APPLY_FALLBACK + + +def change_quotation_marks( + normalized_usfm: 
str, + source_quote_convention_name: str, + target_quote_convention_name: str, + quotation_mark_update_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), +) -> str: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler( + source_quote_convention_name, target_quote_convention_name, quotation_mark_update_settings + ) + ) + + updater = UpdateUsfmParserHandler(update_block_handlers=[quote_convention_changer]) + parse_usfm(normalized_usfm, updater) + + return updater.get_usfm() + + +def create_quote_convention_changing_usfm_update_block_handler( + source_quote_convention_name: str, + target_quote_convention_name: str, + quotation_mark_update_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), +) -> QuoteConventionChangingUsfmUpdateBlockHandler: + source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + source_quote_convention_name + ) + assert source_quote_convention is not None + + target_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + target_quote_convention_name + ) + assert target_quote_convention is not None + + return QuoteConventionChangingUsfmUpdateBlockHandler( + source_quote_convention, + target_quote_convention, + quotation_mark_update_settings, + ) + + +def assert_usfm_equal(observed_usfm: str, expected_usfm: str) -> None: + for observed_line, expected_line in zip(observed_usfm.split("\n"), expected_usfm.split("\n")): + assert observed_line.strip() == expected_line.strip() + + +class MockQuotationMarkFinder(QuotationMarkFinder): + def __init__(self) -> None: + super().__init__(QuoteConventionSet([])) + self.num_times_called = 0 + self.matches_to_return = [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('this is a "test').build(), 10, 11), + QuotationMarkStringMatch(TextSegment.Builder().set_text('the test ends" 
here').build(), 13, 14), + ] + + def find_all_potential_quotation_marks_in_text_segments( + self, text_segments: List[TextSegment] + ) -> List[QuotationMarkStringMatch]: + self.num_times_called += 1 + return self.matches_to_return + + +class MockQuotationMarkResolver(QuotationMarkResolver): + def __init__(self): + super().__init__(QuotationMarkResolutionSettings()) + self.num_times_called = 0 + + def resolve_quotation_marks( + self, quote_matches: List[QuotationMarkStringMatch] + ) -> Generator[QuotationMarkMetadata, None, None]: + self.num_times_called += 1 + current_depth = 1 + current_direction = QuotationMarkDirection.Opening + for quote_match in quote_matches: + yield quote_match.resolve(current_depth, current_direction) + current_depth += 1 + current_direction = ( + QuotationMarkDirection.Closing + if current_direction == QuotationMarkDirection.Opening + else QuotationMarkDirection.Opening + ) From e7c279c0b56e86771745c39bda85b5ec601b4954 Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 27 Jun 2025 17:32:45 -0400 Subject: [PATCH 16/31] Remaining unit tests --- machine/corpora/analysis/__init__.py | 26 +- .../depth_based_quotation_mark_resolver.py | 283 +- .../preliminary_quotation_analyzer.py | 206 +- .../quotation_mark_resolution_settings.py | 6 + .../analysis/quotation_mark_string_match.py | 37 +- ...onvention_detection_resolution_settings.py | 8 + .../corpora/analysis/quote_convention_set.py | 5 + ...otation_mark_update_resolution_settings.py | 8 + ...est_depth_based_quotation_mark_resolver.py | 2655 +++++++++++++++++ .../test_preliminary_quotation_analyzer.py | 987 ++++++ .../analysis/test_quotation_mark_finder.py | 290 ++ .../test_quotation_mark_string_match.py | 78 - .../analysis/test_quote_convention_set.py | 185 +- 13 files changed, 4475 insertions(+), 299 deletions(-) create mode 100644 tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py create mode 100644 tests/corpora/analysis/test_preliminary_quotation_analyzer.py create mode 
100644 tests/corpora/analysis/test_quotation_mark_finder.py diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/analysis/__init__.py index 9741cb8d..081254f6 100644 --- a/machine/corpora/analysis/__init__.py +++ b/machine/corpora/analysis/__init__.py @@ -1,5 +1,19 @@ from .chapter import Chapter -from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .depth_based_quotation_mark_resolver import ( + DepthBasedQuotationMarkResolver, + QuotationContinuerState, + QuotationContinuerStyle, + QuotationMarkCategorizer, + QuotationMarkResolverState, +) +from .preliminary_quotation_analyzer import ( + ApostropheProportionStatistics, + PreliminaryApostropheAnalyzer, + PreliminaryQuotationAnalyzer, + QuotationMarkGrouper, + QuotationMarkSequences, + QuotationMarkWordPositions, +) from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_finder import QuotationMarkFinder from .quotation_mark_metadata import QuotationMarkMetadata @@ -18,13 +32,23 @@ from .verse import Verse __all__ = [ + "ApostropheProportionStatistics", "Chapter", "DepthBasedQuotationMarkResolver", + "PreliminaryApostropheAnalyzer", + "PreliminaryQuotationAnalyzer", "SingleLevelQuoteConvention", + "QuotationContinuerState", + "QuotationContinuerStyle", + "QuotationMarkCategorizer", "QuotationMarkCounts", "QuotationMarkDirection", + "QuotationMarkGrouper", "QuotationMarkMetadata", + "QuotationMarkResolverState", + "QuotationMarkSequences", "QuotationMarkStringMatch", + "QuotationMarkWordPositions", "QuoteConvention", "QuoteConventionAnalysis", "QuoteConventionDetectionResolutionSettings", diff --git a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/analysis/depth_based_quotation_mark_resolver.py index e522faea..9695b65a 100644 --- a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py +++ b/machine/corpora/analysis/depth_based_quotation_mark_resolver.py @@ -1,3 +1,4 @@ +from enum import Enum, auto 
from typing import Generator, Set, Union import regex @@ -14,6 +15,9 @@ class QuotationMarkResolverState: def __init__(self): + self.reset() + + def reset(self) -> None: self.quotation_stack: list[QuotationMarkMetadata] = [] self.current_depth: int = 0 @@ -53,10 +57,20 @@ def get_deepest_opening_quotation_mark(self) -> str: return self.quotation_stack[-1].get_quotation_mark() +class QuotationContinuerStyle(Enum): + UNDETERMINED = auto() + ENGLISH = auto() + SPANISH = auto() + + class QuotationContinuerState: def __init__(self): + self.reset() + + def reset(self) -> None: self.quotation_continuer_stack: list[QuotationMarkMetadata] = [] self.current_depth = 0 + self.continuer_style = QuotationContinuerStyle.UNDETERMINED def get_current_depth(self) -> int: return self.current_depth @@ -64,88 +78,47 @@ def get_current_depth(self) -> int: def has_continuer_been_observed(self) -> bool: return len(self.quotation_continuer_stack) > 0 + def get_continuer_style(self) -> QuotationContinuerStyle: + return self.continuer_style + def add_quotation_continuer( - self, quote_match: QuotationMarkStringMatch, quotation_mark_resolver_state: QuotationMarkResolverState + self, + quote_match: QuotationMarkStringMatch, + quotation_mark_resolver_state: QuotationMarkResolverState, + quotation_continuer_style: QuotationContinuerStyle, ) -> QuotationMarkMetadata: quote = quote_match.resolve(len(self.quotation_continuer_stack) + 1, QuotationMarkDirection.Opening) self.quotation_continuer_stack.append(quote) self.current_depth += 1 + self.continuer_style = quotation_continuer_style if len(self.quotation_continuer_stack) == len(quotation_mark_resolver_state.quotation_stack): self.quotation_continuer_stack.clear() self.current_depth = 0 return quote -class DepthBasedQuotationMarkResolver(QuotationMarkResolver): +class QuotationMarkCategorizer: apostrophe_pattern = regex.compile(r"[\'\u2019\u2018]", regex.U) - def __init__(self, settings: QuotationMarkResolutionSettings): - self._settings = 
settings - self._quotation_mark_resolver_state = QuotationMarkResolverState() - self._quotation_continuer_state = QuotationContinuerState() - self._issues: Set[QuotationMarkResolutionIssue] = set() - - def reset(self) -> None: - self._quotation_mark_resolver_state = QuotationMarkResolverState() - self._quotation_continuer_state = QuotationContinuerState() - self._issues = set() - - def resolve_quotation_marks( - self, quote_matches: list[QuotationMarkStringMatch] - ) -> Generator[QuotationMarkMetadata, None, None]: - for quote_index, quote_match in enumerate(quote_matches): - previous_mark = None if quote_index == 0 else quote_matches[quote_index - 1] - next_mark = None if quote_index == len(quote_matches) - 1 else quote_matches[quote_index + 1] - yield from self._resolve_quotation_mark(quote_match, previous_mark, next_mark) - if self._quotation_mark_resolver_state.has_open_quotation_mark(): - self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) - - def _resolve_quotation_mark( + def __init__( self, - quote_match: QuotationMarkStringMatch, - previous_mark: Union[QuotationMarkStringMatch, None], - next_mark: Union[QuotationMarkStringMatch, None], - ) -> Generator[QuotationMarkMetadata, None, None]: - if self._is_opening_quote(quote_match): - if self._is_quotation_continuer(quote_match, previous_mark, next_mark): - yield self._process_quotation_continuer(quote_match) - else: - if self._is_depth_too_great(): - self._issues.add(QuotationMarkResolutionIssue.TOO_DEEP_NESTING) - return - - yield self._process_opening_mark(quote_match) - elif self._is_apostrophe(quote_match, next_mark): - pass - elif self._is_closing_quote(quote_match): - if not self._quotation_mark_resolver_state.has_open_quotation_mark(): - self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) - return - yield self._process_closing_mark(quote_match) - elif self._is_malformed_closing_quote(quote_match): - yield self._process_closing_mark(quote_match) - elif 
self._is_malformed_opening_quote(quote_match): - yield self._process_opening_mark(quote_match) - elif self._is_unpaired_closing_quote(quote_match): - self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) - else: - self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) - - def _is_quotation_continuer( + quotation_mark_resolution_settings: QuotationMarkResolutionSettings, + quotation_mark_resolver_state: QuotationMarkResolverState, + quotation_continuer_state: QuotationContinuerState, + ): + self._settings = quotation_mark_resolution_settings + self._quotation_mark_resolver_state = quotation_mark_resolver_state + self._quotation_continuer_state = quotation_continuer_state + + def is_english_quotation_continuer( self, quote_match: QuotationMarkStringMatch, previous_match: Union[QuotationMarkStringMatch, None], next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if ( - self._settings.should_rely_on_paragraph_markers() - and not quote_match.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker) - ): + if self._quotation_continuer_state.get_continuer_style() == QuotationContinuerStyle.SPANISH: return False - if not self._quotation_mark_resolver_state.has_open_quotation_mark(): - return False - - if quote_match.has_quote_introducer_in_leading_substring(): + if not self._meets_quote_continuer_prerequisites(quote_match, previous_match, next_match): return False if not self._quotation_continuer_state.has_continuer_been_observed(): @@ -161,45 +134,73 @@ def _is_quotation_continuer( if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): if next_match is None or next_match.get_start_index() != quote_match.get_end_index(): return False - elif ( - self._quotation_continuer_state.get_current_depth() - >= self._quotation_mark_resolver_state.get_current_depth() - ): - return False else: if ( quote_match.get_quotation_mark() - != 
self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() + != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( + self._quotation_continuer_state.get_current_depth() + 1 + ) ): return False return True - def _process_quotation_continuer(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - return self._quotation_continuer_state.add_quotation_continuer(quote_match, self._quotation_mark_resolver_state) + def is_spanish_quotation_continuer( + self, + quote_match: QuotationMarkStringMatch, + previous_match: Union[QuotationMarkStringMatch, None], + next_match: Union[QuotationMarkStringMatch, None], + ) -> bool: + if self._quotation_continuer_state.get_continuer_style() == QuotationContinuerStyle.ENGLISH: + return False + if not self._meets_quote_continuer_prerequisites(quote_match, previous_match, next_match): + return False - def _is_depth_too_great(self) -> bool: - return self._quotation_mark_resolver_state.are_more_than_n_quotes_open(4) + if not self._quotation_continuer_state.has_continuer_been_observed(): + if quote_match.start_index > 0: + return False - def _process_opening_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - if not self._settings.does_metadata_match_quotation_mark( - quote_match.get_quotation_mark(), - self._quotation_mark_resolver_state.get_current_depth(), - QuotationMarkDirection.Opening, - ): - self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) - return self._quotation_mark_resolver_state.add_opening_quotation_mark(quote_match) + # this has only been observed with guillemets so far + if quote_match.get_quotation_mark() != "»": + return False + if not self._settings.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( + self._quotation_continuer_state.get_current_depth() + 1 + ), + quote_match.get_quotation_mark(), + ): + return False + if 
self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): + if next_match is None or next_match.get_start_index() != quote_match.get_end_index(): + return False + else: + if not self._settings.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( + self._quotation_continuer_state.get_current_depth() + 1 + ), + quote_match.get_quotation_mark(), + ): + return False - def _process_closing_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - if not self._settings.does_metadata_match_quotation_mark( - quote_match.get_quotation_mark(), - self._quotation_mark_resolver_state.get_current_depth() - 1, - QuotationMarkDirection.Closing, + return True + + def _meets_quote_continuer_prerequisites( + self, + quote_match: QuotationMarkStringMatch, + previous_match: Union[QuotationMarkStringMatch, None], + next_match: Union[QuotationMarkStringMatch, None], + ) -> bool: + if ( + self._settings.should_rely_on_paragraph_markers() + and not quote_match.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker) ): - self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) - return self._quotation_mark_resolver_state.add_closing_quotation_mark(quote_match) + return False + if not self._quotation_mark_resolver_state.has_open_quotation_mark(): + return False - def _is_opening_quote( + return True + + def is_opening_quote( self, match: QuotationMarkStringMatch, ) -> bool: @@ -216,7 +217,7 @@ def _is_opening_quote( ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation()) return True - def _is_closing_quote( + def is_closing_quote( self, match: QuotationMarkStringMatch, ) -> bool: @@ -227,11 +228,14 @@ def _is_closing_quote( # if the quote convention is ambiguous, use whitespace as a clue if self._settings.is_valid_opening_quotation_mark(match): return ( - match.has_trailing_whitespace() or match.has_trailing_punctuation() or 
match.is_at_end_of_segment() + match.has_trailing_whitespace() + or match.has_trailing_punctuation() + or match.is_at_end_of_segment() + or match.does_next_character_match(self._settings.get_closing_quotation_mark_regex()) ) and not match.has_leading_whitespace() return True - def _is_malformed_opening_quote( + def is_malformed_opening_quote( self, match: QuotationMarkStringMatch, ) -> bool: @@ -250,7 +254,7 @@ def _is_malformed_opening_quote( return False - def _is_malformed_closing_quote( + def is_malformed_closing_quote( self, match: QuotationMarkStringMatch, ) -> bool: @@ -259,7 +263,8 @@ def _is_malformed_closing_quote( return ( ( - (match.is_at_end_of_segment() or not match.has_trailing_whitespace()) + match.is_at_end_of_segment() + or not match.has_trailing_whitespace() or (match.has_leading_whitespace() and match.has_trailing_whitespace()) ) and self._quotation_mark_resolver_state.has_open_quotation_mark() @@ -268,7 +273,7 @@ def _is_malformed_closing_quote( ) ) - def _is_unpaired_closing_quote( + def is_unpaired_closing_quote( self, match: QuotationMarkStringMatch, ) -> bool: @@ -288,7 +293,7 @@ def _does_most_recent_opening_mark_immediately_precede(self, match: QuotationMar self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.get_previous_character() ) - def _is_apostrophe( + def is_apostrophe( self, match: QuotationMarkStringMatch, next_match: Union[QuotationMarkStringMatch, None], @@ -335,5 +340,93 @@ def _is_apostrophe( return False + +class DepthBasedQuotationMarkResolver(QuotationMarkResolver): + def __init__(self, settings: QuotationMarkResolutionSettings): + self._settings = settings + self._quotation_mark_resolver_state = QuotationMarkResolverState() + self._quotation_continuer_state = QuotationContinuerState() + self._quotation_mark_categorizer = QuotationMarkCategorizer( + self._settings, self._quotation_mark_resolver_state, self._quotation_continuer_state + ) + self._issues: Set[QuotationMarkResolutionIssue] = set() 
+ + def reset(self) -> None: + self._quotation_mark_resolver_state.reset() + self._quotation_continuer_state.reset() + self._issues = set() + + def resolve_quotation_marks( + self, quote_matches: list[QuotationMarkStringMatch] + ) -> Generator[QuotationMarkMetadata, None, None]: + for quote_index, quote_match in enumerate(quote_matches): + previous_mark = None if quote_index == 0 else quote_matches[quote_index - 1] + next_mark = None if quote_index == len(quote_matches) - 1 else quote_matches[quote_index + 1] + yield from self._resolve_quotation_mark(quote_match, previous_mark, next_mark) + if self._quotation_mark_resolver_state.has_open_quotation_mark(): + self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) + + def _resolve_quotation_mark( + self, + quote_match: QuotationMarkStringMatch, + previous_mark: Union[QuotationMarkStringMatch, None], + next_mark: Union[QuotationMarkStringMatch, None], + ) -> Generator[QuotationMarkMetadata, None, None]: + if self._quotation_mark_categorizer.is_opening_quote(quote_match): + if self._quotation_mark_categorizer.is_english_quotation_continuer(quote_match, previous_mark, next_mark): + yield self._process_quotation_continuer(quote_match, QuotationContinuerStyle.ENGLISH) + else: + if self._is_depth_too_great(): + self._issues.add(QuotationMarkResolutionIssue.TOO_DEEP_NESTING) + return + + yield self._process_opening_mark(quote_match) + elif self._quotation_mark_categorizer.is_apostrophe(quote_match, next_mark): + pass + elif self._quotation_mark_categorizer.is_closing_quote(quote_match): + if self._quotation_mark_categorizer.is_spanish_quotation_continuer(quote_match, previous_mark, next_mark): + yield self._process_quotation_continuer(quote_match, QuotationContinuerStyle.SPANISH) + elif not self._quotation_mark_resolver_state.has_open_quotation_mark(): + self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) + return + else: + yield self._process_closing_mark(quote_match) + elif 
self._quotation_mark_categorizer.is_malformed_closing_quote(quote_match): + yield self._process_closing_mark(quote_match) + elif self._quotation_mark_categorizer.is_malformed_opening_quote(quote_match): + yield self._process_opening_mark(quote_match) + elif self._quotation_mark_categorizer.is_unpaired_closing_quote(quote_match): + self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) + else: + self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) + + def _process_quotation_continuer( + self, quote_match: QuotationMarkStringMatch, continuer_style: QuotationContinuerStyle + ) -> QuotationMarkMetadata: + return self._quotation_continuer_state.add_quotation_continuer( + quote_match, self._quotation_mark_resolver_state, continuer_style + ) + + def _is_depth_too_great(self) -> bool: + return self._quotation_mark_resolver_state.are_more_than_n_quotes_open(3) + + def _process_opening_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + if not self._settings.does_metadata_match_quotation_mark( + quote_match.get_quotation_mark(), + self._quotation_mark_resolver_state.get_current_depth(), + QuotationMarkDirection.Opening, + ): + self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) + return self._quotation_mark_resolver_state.add_opening_quotation_mark(quote_match) + + def _process_closing_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + if not self._settings.does_metadata_match_quotation_mark( + quote_match.get_quotation_mark(), + self._quotation_mark_resolver_state.get_current_depth() - 1, + QuotationMarkDirection.Closing, + ): + self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) + return self._quotation_mark_resolver_state.add_closing_quotation_mark(quote_match) + def get_issues(self) -> Set[QuotationMarkResolutionIssue]: return self._issues diff --git a/machine/corpora/analysis/preliminary_quotation_analyzer.py 
b/machine/corpora/analysis/preliminary_quotation_analyzer.py index 6204243d..a5699e94 100644 --- a/machine/corpora/analysis/preliminary_quotation_analyzer.py +++ b/machine/corpora/analysis/preliminary_quotation_analyzer.py @@ -10,7 +10,7 @@ from .verse import Verse -class CharacterCountStatistics: +class ApostropheProportionStatistics: def __init__(self): self.reset() @@ -84,7 +84,7 @@ def are_initial_and_final_rates_similar(self, quotation_mark: str) -> bool: num_initial_marks: int = self._get_word_initial_occurrences(quotation_mark) num_final_marks: int = self._get_word_final_occurrences(quotation_mark) num_total_marks: int = self._get_total_occurrences(quotation_mark) - return num_total_marks > 0 and abs(num_initial_marks - num_final_marks) / num_total_marks > 0.3 + return num_total_marks > 0 and abs(num_initial_marks - num_final_marks) / num_total_marks < 0.3 def is_mark_commonly_mid_word(self, quotation_mark: str) -> bool: num_mid_word_marks: int = self._get_mid_word_occurrences(quotation_mark) @@ -92,25 +92,6 @@ def is_mark_commonly_mid_word(self, quotation_mark: str) -> bool: return num_total_marks > 0 and num_mid_word_marks / num_total_marks > 0.3 -class QuotationMarkVersePositions: - def __init__(self): - self.reset() - - def reset(self) -> None: - self.verse_starting_quotation_mark_counts: Dict[str, int] = dict() - self.verse_ending_quotation_mark_counts: Dict[str, int] = dict() - - def process_verse_starting_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> None: - if quotation_mark_match.get_quotation_mark() not in self.verse_starting_quotation_mark_counts: - self.verse_starting_quotation_mark_counts[quotation_mark_match.get_quotation_mark()] = 0 - self.verse_starting_quotation_mark_counts[quotation_mark_match.get_quotation_mark()] += 1 - - def process_verse_ending_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> None: - if quotation_mark_match.get_quotation_mark() not in 
self.verse_ending_quotation_mark_counts: - self.verse_ending_quotation_mark_counts[quotation_mark_match.get_quotation_mark()] = 0 - self.verse_ending_quotation_mark_counts[quotation_mark_match.get_quotation_mark()] += 1 - - class QuotationMarkSequences: def __init__(self): self.reset() @@ -211,83 +192,39 @@ def has_distinct_paired_quotation_mark(self, quotation_mark: str) -> bool: ) -class PreliminaryQuotationAnalyzer: +class PreliminaryApostropheAnalyzer: apostrophe_pattern = regex.compile(r"[\'\u2019]", regex.U) - def __init__(self, quote_conventions: QuoteConventionSet): - self.quote_conventions = quote_conventions - self.character_count_statistics = CharacterCountStatistics() - self.word_position_statistics = QuotationMarkWordPositions() - self.verse_positions = QuotationMarkVersePositions() - self.quotation_mark_sequences = QuotationMarkSequences() - self._reset_analysis() - - def _reset_analysis(self) -> None: - self.character_count_statistics.reset() - self.word_position_statistics.reset() - self.verse_positions.reset() - self.quotation_mark_sequences.reset() - self.earlier_quotation_mark_counts: dict[str, int] = dict() - self.later_quotation_mark_counts: dict[str, int] = dict() - - def narrow_down_possible_quote_conventions(self, chapters: List[Chapter]) -> QuoteConventionSet: - for chapter in chapters: - self._analyze_quotation_marks_for_chapter(chapter) - return self._select_compatible_quote_conventions() - - def _analyze_quotation_marks_for_chapter(self, chapter: Chapter) -> None: - for verse in chapter.get_verses(): - self._analyze_quotation_marks_for_verse(verse) - - def _analyze_quotation_marks_for_verse(self, verse: Verse) -> None: - self._count_characters_in_verse(verse) - quotation_marks = QuotationMarkFinder(self.quote_conventions).find_all_potential_quotation_marks_in_verse(verse) - self._analyze_quotation_mark_sequence(quotation_marks) - self._count_verse_starting_and_ending_quotation_marks(quotation_marks) - - def 
_count_characters_in_verse(self, verse: Verse) -> None: - for text_segment in verse.get_text_segments(): - self._count_characters_in_text_segment(text_segment) - - def _count_characters_in_text_segment(self, text_segment: TextSegment) -> None: - self.character_count_statistics.count_characters(text_segment) - - def _analyze_quotation_mark_sequence(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: - quotation_mark_grouper: QuotationMarkGrouper = QuotationMarkGrouper(quotation_marks, self.quote_conventions) - for earlier_mark, later_mark in quotation_mark_grouper.get_quotation_mark_pairs(): - self.quotation_mark_sequences.record_earlier_quotation_mark(earlier_mark) - self.quotation_mark_sequences.record_later_quotation_mark(later_mark) + def __init__(self): + self._apostrophe_proportion_statistics = ApostropheProportionStatistics() + self._word_position_statistics = QuotationMarkWordPositions() + self.reset() - def _count_verse_starting_and_ending_quotation_marks(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: + def reset(self) -> None: + self._apostrophe_proportion_statistics.reset() + self._word_position_statistics.reset() + + def process_quotation_marks( + self, text_segments: List[TextSegment], quotation_marks: List[QuotationMarkStringMatch] + ) -> None: + for text_segment in text_segments: + self._apostrophe_proportion_statistics.count_characters(text_segment) for quotation_mark_match in quotation_marks: - if quotation_mark_match.does_quotation_mark_match(self.apostrophe_pattern): - self._count_apostrophe(quotation_mark_match) - if self._is_at_start_of_verse(quotation_mark_match): - self.verse_positions.process_verse_starting_quotation_mark(quotation_mark_match) - if self._is_at_end_of_verse(quotation_mark_match): - self.verse_positions.process_verse_ending_quotation_mark(quotation_mark_match) - - def _is_at_start_of_verse(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: - return ( - 
quotation_mark_match.get_text_segment().is_first_segment_in_verse() - and not quotation_mark_match.has_letter_in_leading_substring() - ) + self._process_quotation_mark(quotation_mark_match) - def _is_at_end_of_verse(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: - return ( - quotation_mark_match.get_text_segment().is_last_segment_in_verse() - and not quotation_mark_match.has_letter_in_trailing_substring() - ) + def _process_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> None: + if quotation_mark_match.does_quotation_mark_match(self.apostrophe_pattern): + self._count_apostrophe(quotation_mark_match) def _count_apostrophe(self, apostrophe_match: QuotationMarkStringMatch) -> None: apostrophe: str = apostrophe_match.get_quotation_mark() - self.character_count_statistics.add_apostrophe() + self._apostrophe_proportion_statistics.add_apostrophe() if self._is_match_word_initial(apostrophe_match): - self.word_position_statistics.count_word_initial_apostrophe(apostrophe) + self._word_position_statistics.count_word_initial_apostrophe(apostrophe) elif self._is_match_mid_word(apostrophe_match): - self.word_position_statistics.count_mid_word_apostrophe(apostrophe) + self._word_position_statistics.count_mid_word_apostrophe(apostrophe) elif self._is_match_word_final(apostrophe_match): - self.word_position_statistics.count_word_final_apostrophe(apostrophe) + self._word_position_statistics.count_word_final_apostrophe(apostrophe) def _is_match_word_initial(self, apostrophe_match: QuotationMarkStringMatch) -> bool: if apostrophe_match.has_trailing_whitespace(): @@ -310,68 +247,103 @@ def _is_match_word_final(self, apostrophe_match: QuotationMarkStringMatch) -> bo return False return True + def is_apostrophe_only(self, mark: str) -> bool: + if not self.apostrophe_pattern.search(mark): + return False + + if self._word_position_statistics.is_mark_rarely_initial( + mark + ) or self._word_position_statistics.is_mark_rarely_final(mark): + return True 
+ + if self._word_position_statistics.are_initial_and_final_rates_similar( + mark + ) and self._word_position_statistics.is_mark_commonly_mid_word(mark): + return True + + if self._apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.02): + return True + + return False + + +class PreliminaryQuotationAnalyzer: + + def __init__(self, quote_conventions: QuoteConventionSet): + self._quote_conventions = quote_conventions + self._apostrophe_analyzer = PreliminaryApostropheAnalyzer() + self._quotation_mark_sequences = QuotationMarkSequences() + self.reset() + + def reset(self) -> None: + self._apostrophe_analyzer.reset() + self._quotation_mark_sequences.reset() + + def narrow_down_possible_quote_conventions(self, chapters: List[Chapter]) -> QuoteConventionSet: + for chapter in chapters: + self._analyze_quotation_marks_for_chapter(chapter) + return self._select_compatible_quote_conventions() + + def _analyze_quotation_marks_for_chapter(self, chapter: Chapter) -> None: + for verse in chapter.get_verses(): + self._analyze_quotation_marks_for_verse(verse) + + def _analyze_quotation_marks_for_verse(self, verse: Verse) -> None: + quotation_marks: List[QuotationMarkStringMatch] = QuotationMarkFinder( + self._quote_conventions + ).find_all_potential_quotation_marks_in_verse(verse) + self._analyze_quotation_mark_sequence(quotation_marks) + self._apostrophe_analyzer.process_quotation_marks(verse.get_text_segments(), quotation_marks) + + def _analyze_quotation_mark_sequence(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: + quotation_mark_grouper: QuotationMarkGrouper = QuotationMarkGrouper(quotation_marks, self._quote_conventions) + for earlier_mark, later_mark in quotation_mark_grouper.get_quotation_mark_pairs(): + self._quotation_mark_sequences.record_earlier_quotation_mark(earlier_mark) + self._quotation_mark_sequences.record_later_quotation_mark(later_mark) + def _select_compatible_quote_conventions(self) -> QuoteConventionSet: 
opening_quotation_marks = self._find_opening_quotation_marks() closing_quotation_marks = self._find_closing_quotation_marks() - return self.quote_conventions.filter_to_compatible_quote_conventions( + return self._quote_conventions.filter_to_compatible_quote_conventions( opening_quotation_marks, closing_quotation_marks ) def _find_opening_quotation_marks(self) -> List[str]: return [ quotation_mark - for quotation_mark in self.quote_conventions.get_possible_opening_marks() + for quotation_mark in self._quote_conventions.get_possible_opening_marks() if self._is_opening_quotation_mark(quotation_mark) ] def _is_opening_quotation_mark(self, quotation_mark: str) -> bool: - if self._is_apostrophe_only(quotation_mark): + if self._apostrophe_analyzer.is_apostrophe_only(quotation_mark): return False - if self.quotation_mark_sequences.is_mark_much_more_common_earlier(quotation_mark): + if self._quotation_mark_sequences.is_mark_much_more_common_earlier(quotation_mark): return True - if self.quotation_mark_sequences.is_mark_common_early_and_late( + if self._quotation_mark_sequences.is_mark_common_early_and_late( quotation_mark - ) and self.quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): + ) and self._quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): return True return False def _find_closing_quotation_marks(self) -> List[str]: return [ quotation_mark - for quotation_mark in self.quote_conventions.get_possible_closing_marks() + for quotation_mark in self._quote_conventions.get_possible_closing_marks() if self._is_closing_quotation_mark(quotation_mark) ] def _is_closing_quotation_mark(self, quotation_mark: str) -> bool: - if self._is_apostrophe_only(quotation_mark): + if self._apostrophe_analyzer.is_apostrophe_only(quotation_mark): return False - if self.quotation_mark_sequences.is_mark_much_more_common_later(quotation_mark): + if self._quotation_mark_sequences.is_mark_much_more_common_later(quotation_mark): return True - if 
self.quotation_mark_sequences.is_mark_common_early_and_late( + if self._quotation_mark_sequences.is_mark_common_early_and_late( quotation_mark - ) and self.quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): - return True - return False - - def _is_apostrophe_only(self, mark: str) -> bool: - if not self.apostrophe_pattern.search(mark): - return False - - if self.word_position_statistics.is_mark_rarely_initial( - mark - ) or self.word_position_statistics.is_mark_rarely_final(mark): - return True - - if self.word_position_statistics.are_initial_and_final_rates_similar( - mark - ) and self.word_position_statistics.is_mark_commonly_mid_word(mark): + ) and self._quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): return True - - if self.character_count_statistics.is_apostrophe_proportion_greater_than(0.02): - return True - return False diff --git a/machine/corpora/analysis/quotation_mark_resolution_settings.py b/machine/corpora/analysis/quotation_mark_resolution_settings.py index e3385f1d..cf50a2cd 100644 --- a/machine/corpora/analysis/quotation_mark_resolution_settings.py +++ b/machine/corpora/analysis/quotation_mark_resolution_settings.py @@ -1,6 +1,8 @@ from abc import ABC from typing import Set +import regex + from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_string_match import QuotationMarkStringMatch @@ -11,6 +13,10 @@ def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStr def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: ... + def get_opening_quotation_mark_regex(self) -> regex.Pattern: ... + + def get_closing_quotation_mark_regex(self) -> regex.Pattern: ... + def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: ... def should_rely_on_paragraph_markers(self) -> bool: ... 
diff --git a/machine/corpora/analysis/quotation_mark_string_match.py b/machine/corpora/analysis/quotation_mark_string_match.py index 0da48fc3..6be5494c 100644 --- a/machine/corpora/analysis/quotation_mark_string_match.py +++ b/machine/corpora/analysis/quotation_mark_string_match.py @@ -24,6 +24,15 @@ def __init__(self, text_segment: TextSegment, start_index: int, end_index: int): self.start_index = start_index self.end_index = end_index + def __eq__(self, value): + if not isinstance(value, QuotationMarkStringMatch): + return False + return ( + self.text_segment == value.text_segment + and self.start_index == value.start_index + and self.end_index == value.end_index + ) + def get_quotation_mark(self) -> str: return self.text_segment.get_text()[self.start_index : self.end_index] @@ -55,6 +64,18 @@ def get_previous_character(self) -> Union[str, None]: return None return self.text_segment.get_text()[self.start_index - 1] + def get_previous_character_string_match(self) -> Union["QuotationMarkStringMatch", None]: + if self.start_index == 0: + previous_segment = self.text_segment.get_previous_segment() + if previous_segment is not None and not self.text_segment.is_marker_in_preceding_context( + UsfmMarkerType.ParagraphMarker + ): + return QuotationMarkStringMatch( + previous_segment, previous_segment.length() - 1, previous_segment.length() + ) + return None + return QuotationMarkStringMatch(self.text_segment, self.start_index - 1, self.end_index - 1) + def get_next_character(self) -> Union[str, None]: if self.is_at_end_of_segment(): next_segment = self.text_segment.get_next_segment() @@ -65,6 +86,16 @@ def get_next_character(self) -> Union[str, None]: return None return self.text_segment.get_text()[self.end_index] + def get_next_character_string_match(self) -> Union["QuotationMarkStringMatch", None]: + if self.is_at_end_of_segment(): + next_segment = self.text_segment.get_next_segment() + if next_segment is not None and not next_segment.is_marker_in_preceding_context( + 
UsfmMarkerType.ParagraphMarker + ): + return QuotationMarkStringMatch(next_segment, 0, 1) + return None + return QuotationMarkStringMatch(self.text_segment, self.start_index + 1, self.end_index + 1) + def does_leading_substring_match(self, regex_pattern: regex.Pattern) -> bool: return regex_pattern.search(self.text_segment.substring_before(self.start_index)) is not None @@ -136,9 +167,3 @@ def has_trailing_latin_letter(self) -> bool: def has_quote_introducer_in_leading_substring(self) -> bool: return self.does_leading_substring_match(self.quote_introducer_pattern) - - def has_leading_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: - return self.does_previous_character_match(quote_convention_set.get_closing_quotation_mark_regex()) - - def has_trailing_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: - return self.does_next_character_match(quote_convention_set.get_closing_quotation_mark_regex()) diff --git a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py b/machine/corpora/analysis/quote_convention_detection_resolution_settings.py index f38c6a5c..7192bd3d 100644 --- a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py +++ b/machine/corpora/analysis/quote_convention_detection_resolution_settings.py @@ -1,5 +1,7 @@ from typing import Set +import regex + from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings from .quotation_mark_string_match import QuotationMarkStringMatch @@ -17,6 +19,12 @@ def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStr def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_set) + def get_opening_quotation_mark_regex(self) -> regex.Pattern: + return 
self._quote_convention_set.get_opening_quotation_mark_regex() + + def get_closing_quotation_mark_regex(self) -> regex.Pattern: + return self._quote_convention_set.get_closing_quotation_mark_regex() + def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: return self._quote_convention_set.are_marks_a_valid_pair(opening_mark, closing_mark) diff --git a/machine/corpora/analysis/quote_convention_set.py b/machine/corpora/analysis/quote_convention_set.py index c4203774..d825c45c 100644 --- a/machine/corpora/analysis/quote_convention_set.py +++ b/machine/corpora/analysis/quote_convention_set.py @@ -14,6 +14,11 @@ def __init__(self, conventions: List[QuoteConvention]): self._create_quote_regexes() self._create_quotation_mark_pair_map() + def __eq__(self, other: object) -> bool: + if not isinstance(other, QuoteConventionSet): + return False + return self.conventions == other.conventions + def _create_quote_regexes(self) -> None: opening_quotation_marks: Set[str] = set() closing_quotation_marks: Set[str] = set() diff --git a/machine/corpora/quotation_mark_update_resolution_settings.py b/machine/corpora/quotation_mark_update_resolution_settings.py index 77d9009b..f67dce18 100644 --- a/machine/corpora/quotation_mark_update_resolution_settings.py +++ b/machine/corpora/quotation_mark_update_resolution_settings.py @@ -1,5 +1,7 @@ from typing import Set +import regex + from .analysis.quotation_mark_direction import QuotationMarkDirection from .analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings from .analysis.quotation_mark_string_match import QuotationMarkStringMatch @@ -19,6 +21,12 @@ def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStr def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_singleton_set) + def get_opening_quotation_mark_regex(self) -> regex.Pattern: + 
return self._quote_convention_singleton_set.get_opening_quotation_mark_regex() + + def get_closing_quotation_mark_regex(self) -> regex.Pattern: + return self._quote_convention_singleton_set.get_closing_quotation_mark_regex() + def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: return self._quote_convention_singleton_set.are_marks_a_valid_pair(opening_mark, closing_mark) diff --git a/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py b/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py new file mode 100644 index 00000000..0c3cb1bd --- /dev/null +++ b/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py @@ -0,0 +1,2655 @@ +from pytest import raises + +from machine.corpora import QuotationMarkUpdateResolutionSettings +from machine.corpora.analysis import ( + DepthBasedQuotationMarkResolver, + QuotationContinuerState, + QuotationContinuerStyle, + QuotationMarkCategorizer, + QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkResolutionIssue, + QuotationMarkResolverState, + QuotationMarkStringMatch, + QuoteConventionDetectionResolutionSettings, + QuoteConventionSet, + TextSegment, + UsfmMarkerType, + standard_quote_conventions, +) + + +# QuotationMarkResolverState tests +def test_get_current_depth_quotation_mark_resolver_state() -> None: + quotation_mark_resolver_state = QuotationMarkResolverState() + assert quotation_mark_resolver_state.get_current_depth() == 1 + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert quotation_mark_resolver_state.get_current_depth() == 2 + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert quotation_mark_resolver_state.get_current_depth() == 3 + + quotation_mark_resolver_state.add_closing_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert quotation_mark_resolver_state.get_current_depth() == 2 + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert quotation_mark_resolver_state.get_current_depth() == 1 + + +def test_has_open_quotation_mark() -> None: + quotation_mark_resolver_state = QuotationMarkResolverState() + assert not quotation_mark_resolver_state.has_open_quotation_mark() + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert quotation_mark_resolver_state.has_open_quotation_mark() + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert quotation_mark_resolver_state.has_open_quotation_mark() + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert quotation_mark_resolver_state.has_open_quotation_mark() + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not quotation_mark_resolver_state.has_open_quotation_mark() + + +def test_are_more_than_n_quotes_open() -> None: + quotation_mark_resolver_state = QuotationMarkResolverState() + assert not quotation_mark_resolver_state.are_more_than_n_quotes_open(1) + assert not quotation_mark_resolver_state.are_more_than_n_quotes_open(2) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not quotation_mark_resolver_state.are_more_than_n_quotes_open(1) + assert not quotation_mark_resolver_state.are_more_than_n_quotes_open(2) + + 
def _lone_mark_match(mark):
    """Wrap a single quotation mark in a match over its own one-character text segment."""
    segment = TextSegment.Builder().set_text(mark).build()
    return QuotationMarkStringMatch(segment, 0, 1)


def _resolver_state_with_three_open_marks():
    """Return a fresh resolver state after opening U+201C, U+2018 and U+201C, in that order."""
    resolver_state = QuotationMarkResolverState()
    for mark in ("\u201c", "\u2018", "\u201c"):
        resolver_state.add_opening_quotation_mark(_lone_mark_match(mark))
    return resolver_state


def test_get_opening_quotation_mark_at_depth() -> None:
    """Open marks are looked up by depth (1 = first opened); a depth with no open mark raises."""
    resolver_state = QuotationMarkResolverState()

    def mark_at(depth):
        return resolver_state.get_opening_quotation_mark_at_depth(depth)

    # Nothing has been opened yet.
    with raises(Exception):
        mark_at(1)

    resolver_state.add_opening_quotation_mark(_lone_mark_match("\u201c"))
    assert mark_at(1) == "\u201c"
    with raises(Exception):
        mark_at(2)

    resolver_state.add_opening_quotation_mark(_lone_mark_match("\u2018"))
    assert mark_at(1) == "\u201c"
    assert mark_at(2) == "\u2018"

    # Closing the inner quote leaves only the first level.
    resolver_state.add_closing_quotation_mark(_lone_mark_match("\u2019"))
    assert mark_at(1) == "\u201c"
    with raises(Exception):
        mark_at(2)

    # Closing the outer quote leaves nothing to look up.
    resolver_state.add_closing_quotation_mark(_lone_mark_match("\u201d"))
    with raises(Exception):
        mark_at(1)


def test_get_deepest_opening_mark() -> None:
    """The deepest opening mark follows the most recently opened, still-open quote."""
    resolver_state = QuotationMarkResolverState()
    deepest_mark = resolver_state.get_deepest_opening_quotation_mark

    # Nothing has been opened yet.
    with raises(Exception):
        deepest_mark()

    resolver_state.add_opening_quotation_mark(_lone_mark_match("\u201c"))
    assert deepest_mark() == "\u201c"

    resolver_state.add_opening_quotation_mark(_lone_mark_match("\u2018"))
    assert deepest_mark() == "\u2018"

    resolver_state.add_closing_quotation_mark(_lone_mark_match("\u2019"))
    assert deepest_mark() == "\u201c"

    resolver_state.add_closing_quotation_mark(_lone_mark_match("\u201d"))
    with raises(Exception):
        deepest_mark()


# QuotationContinuerState tests
def test_get_current_depth_quotation_continuer_state() -> None:
    """With three quotes open, the continuer depth is expected to go 0 -> 1 -> 2 -> 0."""
    resolver_state = _resolver_state_with_three_open_marks()
    continuer_state = QuotationContinuerState()
    assert continuer_state.get_current_depth() == 0

    # (continuer mark, depth expected right after it is added)
    for mark, depth_afterwards in (("\u201c", 1), ("\u2018", 2), ("\u201c", 0)):
        continuer_state.add_quotation_continuer(
            _lone_mark_match(mark), resolver_state, QuotationContinuerStyle.ENGLISH
        )
        assert continuer_state.get_current_depth() == depth_afterwards


def test_has_continuer_been_observed() -> None:
    """A continuer counts as observed after the first and second additions, but not after the third."""
    resolver_state = _resolver_state_with_three_open_marks()
    continuer_state = QuotationContinuerState()
    assert not continuer_state.has_continuer_been_observed()

    # (continuer mark, whether a continuer is reported as observed right after it is added)
    for mark, observed_afterwards in (("\u201c", True), ("\u2018", True), ("\u201c", False)):
        continuer_state.add_quotation_continuer(
            _lone_mark_match(mark), resolver_state, QuotationContinuerStyle.ENGLISH
        )
        assert bool(continuer_state.has_continuer_been_observed()) is observed_afterwards


def test_get_continuer_style() -> None:
    """The reported style starts UNDETERMINED and then mirrors the style of the latest continuer."""
    resolver_state = _resolver_state_with_three_open_marks()
    continuer_state = QuotationContinuerState()
    assert continuer_state.get_continuer_style() is QuotationContinuerStyle.UNDETERMINED

    additions = (
        ("\u201c", QuotationContinuerStyle.ENGLISH),
        ("\u2018", QuotationContinuerStyle.SPANISH),
        ("\u201c", QuotationContinuerStyle.ENGLISH),
    )
    for mark, style in additions:
        continuer_state.add_quotation_continuer(_lone_mark_match(mark), resolver_state, style)
        assert continuer_state.get_continuer_style() is style


def test_add_quotation_continuer() -> None:
    """Each added continuer comes back as opening-direction metadata at depth 1, 2, then 3."""
    resolver_state = _resolver_state_with_three_open_marks()
    continuer_state = QuotationContinuerState()

    def expected_metadata(mark, depth):
        segment = TextSegment.Builder().set_text(mark).build()
        return QuotationMarkMetadata(mark, depth, QuotationMarkDirection.Opening, segment, 0, 1)

    first = continuer_state.add_quotation_continuer(
        _lone_mark_match("\u201c"), resolver_state, QuotationContinuerStyle.ENGLISH
    )
    assert first == expected_metadata("\u201c", 1)

    second = continuer_state.add_quotation_continuer(
        _lone_mark_match("\u2018"), resolver_state, QuotationContinuerStyle.SPANISH
    )
    assert second == expected_metadata("\u2018", 2)
    # The style passed with the latest continuer is the one recorded.
    assert continuer_state.get_continuer_style() == QuotationContinuerStyle.SPANISH

    third = continuer_state.add_quotation_continuer(
        _lone_mark_match("\u201c"), resolver_state, QuotationContinuerStyle.ENGLISH
    )
    assert third == expected_metadata("\u201c", 3)
# QuotationMarkCategorizer tests


def _segment_match(text, start, end, *, paragraph_marker=False):
    """Build a match over text[start:end] of a fresh segment, optionally preceded by a paragraph marker."""
    builder = TextSegment.Builder().set_text(text)
    if paragraph_marker:
        builder = builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker)
    return QuotationMarkStringMatch(builder.build(), start, end)


def test_is_english_quotation_continuer() -> None:
    """Exercise is_english_quotation_continuer against the standard_english convention."""
    convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(
        "standard_english"
    )
    assert convention is not None

    detection_settings = QuoteConventionDetectionResolutionSettings(QuoteConventionSet([convention]))
    resolver_state = QuotationMarkResolverState()
    continuer_state = QuotationContinuerState()
    categorizer = QuotationMarkCategorizer(detection_settings, resolver_state, continuer_state)
    is_continuer = categorizer.is_english_quotation_continuer

    def after_paragraph(text, start, end):
        return _segment_match(text, start, end, paragraph_marker=True)

    resolver_state.add_opening_quotation_mark(_segment_match("\u201c", 0, 1))

    # Accepted while the continuer style is English; always rejected once it is Spanish.
    continuer_state.continuer_style = QuotationContinuerStyle.ENGLISH
    assert is_continuer(after_paragraph("\u201ctest", 0, 1), None, None)
    continuer_state.continuer_style = QuotationContinuerStyle.SPANISH
    assert not is_continuer(after_paragraph("\u201ctest", 0, 1), None, None)
    continuer_state.continuer_style = QuotationContinuerStyle.ENGLISH

    # Rejected without a preceding paragraph marker (these settings rely on markers) ...
    assert not is_continuer(_segment_match("\u201ctest", 0, 1), None, None)
    # ... and accepted with one.
    assert is_continuer(after_paragraph("\u201ctest", 0, 1), None, None)

    # A categorizer built from update (denormalization) settings accepts the marker-less match.
    denormalization_categorizer = QuotationMarkCategorizer(
        QuotationMarkUpdateResolutionSettings(convention, convention),
        resolver_state,
        continuer_state,
    )
    assert denormalization_categorizer.is_english_quotation_continuer(
        _segment_match("\u201ctest", 0, 1), None, None
    )

    # Rejected when no quotation marks are open.
    empty_resolver_state = QuotationMarkResolverState()
    empty_categorizer = QuotationMarkCategorizer(detection_settings, empty_resolver_state, continuer_state)
    assert not empty_categorizer.is_english_quotation_continuer(
        after_paragraph("\u201ctest", 0, 1), None, None
    )

    # Rejected when the mark does not start at index 0.
    assert not is_continuer(after_paragraph(" \u201ctest", 1, 2), None, None)

    # Rejected when the mark differs from the mark that is already open.
    assert not is_continuer(after_paragraph("\u2018test", 0, 1), None, None)

    # With several quotes open, the next continuer has to follow the current one immediately.
    resolver_state.add_opening_quotation_mark(_segment_match("\u2018", 0, 1))
    assert not is_continuer(after_paragraph("\u201ctest", 0, 1), None, None)
    assert is_continuer(
        after_paragraph("\u201c\u2018test", 0, 1),
        None,
        after_paragraph("\u201c\u2018test", 1, 2),
    )
    assert is_continuer(
        after_paragraph("\u201c\u201ctest", 0, 1),
        None,
        after_paragraph("\u201c\u201ctest", 1, 2),
    )

    # With several quotes open, a continuer has to match the deepest mark observed so far.
    continuer_state.add_quotation_continuer(
        after_paragraph("\u201c\u2018test", 0, 1),
        resolver_state,
        QuotationContinuerStyle.ENGLISH,
    )
    assert not is_continuer(after_paragraph("\u201c\u201ctest", 1, 2), None, None)
    assert is_continuer(after_paragraph("\u201c\u2018test", 1, 2), None, None)

    resolver_state.add_opening_quotation_mark(_segment_match("\u201c", 0, 1))
    assert is_continuer(after_paragraph("\u201c\u2018\u201ctest", 1, 2), None, None)

    continuer_state.add_quotation_continuer(
        after_paragraph("\u201c\u2018\u201ctest", 1, 2),
        resolver_state,
        QuotationContinuerStyle.ENGLISH,
    )
    assert not is_continuer(after_paragraph("\u201c\u2018\u2018test", 2, 3), None, None)
    assert is_continuer(after_paragraph("\u201c\u2018\u201ctest", 2, 3), None, None)


def test_is_spanish_quotation_continuer() -> None:
    """Exercise is_spanish_quotation_continuer against the western_european convention."""
    convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(
        "western_european"
    )
    assert convention is not None

    detection_settings = QuoteConventionDetectionResolutionSettings(QuoteConventionSet([convention]))
    resolver_state = QuotationMarkResolverState()
    continuer_state = QuotationContinuerState()
    categorizer = QuotationMarkCategorizer(detection_settings, resolver_state, continuer_state)
    is_continuer = categorizer.is_spanish_quotation_continuer

    def after_paragraph(text, start, end):
        return _segment_match(text, start, end, paragraph_marker=True)

    resolver_state.add_opening_quotation_mark(_segment_match("\u00ab", 0, 1))

    # Accepted while the continuer style is Spanish; always rejected once it is English.
    continuer_state.continuer_style = QuotationContinuerStyle.SPANISH
    assert is_continuer(after_paragraph("\u00bbtest", 0, 1), None, None)
    continuer_state.continuer_style = QuotationContinuerStyle.ENGLISH
    assert not is_continuer(after_paragraph("\u00bbtest", 0, 1), None, None)
    continuer_state.continuer_style = QuotationContinuerStyle.SPANISH

    # Rejected without a preceding paragraph marker (these settings rely on markers) ...
    assert not is_continuer(_segment_match("\u00bbtest", 0, 1), None, None)
    # ... and accepted with one.
    assert is_continuer(after_paragraph("\u00bbtest", 0, 1), None, None)

    # A categorizer built from update (denormalization) settings accepts the marker-less match.
    denormalization_categorizer = QuotationMarkCategorizer(
        QuotationMarkUpdateResolutionSettings(convention, convention),
        resolver_state,
        continuer_state,
    )
    assert denormalization_categorizer.is_spanish_quotation_continuer(
        _segment_match("\u00bbtest", 0, 1), None, None
    )

    # Rejected when no quotation marks are open.
    empty_resolver_state = QuotationMarkResolverState()
    empty_categorizer = QuotationMarkCategorizer(detection_settings, empty_resolver_state, continuer_state)
    assert not empty_categorizer.is_spanish_quotation_continuer(
        after_paragraph("\u00bbtest", 0, 1), None, None
    )

    # Rejected when the mark does not start at index 0.
    assert not is_continuer(after_paragraph(" \u00bbtest", 1, 2), None, None)

    # Rejected when the mark does not correspond to the mark that is already open.
    assert not is_continuer(after_paragraph("\u201dtest", 0, 1), None, None)

    # With several quotes open, the next continuer has to follow the current one immediately.
    resolver_state.add_opening_quotation_mark(_segment_match("\u201c", 0, 1))
    assert not is_continuer(after_paragraph("\u00bbtest", 0, 1), None, None)
    assert is_continuer(
        after_paragraph("\u00bb\u201dtest", 0, 1),
        None,
        after_paragraph("\u00bb\u201dtest", 1, 2),
    )
    assert is_continuer(
        after_paragraph("\u00bb\u00bbtest", 0, 1),
        None,
        after_paragraph("\u00bb\u00bbtest", 1, 2),
    )

    # With several quotes open, a continuer has to match the deepest mark observed so far.
    continuer_state.add_quotation_continuer(
        after_paragraph("\u00bb\u201dtest", 0, 1),
        resolver_state,
        QuotationContinuerStyle.SPANISH,
    )
    assert not is_continuer(after_paragraph("\u00bb\u201ctest", 1, 2), None, None)
    assert is_continuer(after_paragraph("\u00bb\u201dtest", 1, 2), None, None)

    resolver_state.add_opening_quotation_mark(_segment_match("\u2018", 0, 1))
    assert is_continuer(after_paragraph("\u00bb\u201d\u2019test", 1, 2), None, None)

    continuer_state.add_quotation_continuer(
        after_paragraph("\u00bb\u201d\u2019test", 1, 2),
        resolver_state,
        QuotationContinuerStyle.SPANISH,
    )
    assert not is_continuer(after_paragraph("\u00bb\u201d\u201dtest", 2, 3), None, None)
    assert is_continuer(after_paragraph("\u00bb\u201d\u2019test", 2, 3), None, None)


def test_is_opening_quote() -> None:
    """Exercise is_opening_quote for three single-convention categorizers and one combined categorizer."""
    convention_registry = standard_quote_conventions.standard_quote_conventions
    resolver_state = QuotationMarkResolverState()
    continuer_state = QuotationContinuerState()

    def categorizer_for(*quote_conventions):
        # Every categorizer shares the same resolver and continuer state objects.
        settings = QuoteConventionDetectionResolutionSettings(QuoteConventionSet(list(quote_conventions)))
        return QuotationMarkCategorizer(settings, resolver_state, continuer_state)

    def expect(categorizer, text, start, end, is_opening):
        verdict = categorizer.is_opening_quote(_segment_match(text, start, end))
        assert bool(verdict) is is_opening

    central_european = convention_registry.get_quote_convention_by_name("central_european")
    assert central_european is not None
    central_european_categorizer = categorizer_for(central_european)

    british_english = convention_registry.get_quote_convention_by_name("british_english")
    assert british_english is not None
    british_english_categorizer = categorizer_for(british_english)

    standard_swedish = convention_registry.get_quote_convention_by_name("standard_swedish")
    assert standard_swedish is not None
    standard_swedish_categorizer = categorizer_for(standard_swedish)

    three_conventions_categorizer = categorizer_for(central_european, british_english, standard_swedish)

    # Only marks that are valid opening marks under the convention(s) are accepted.
    candidate_marks = ("\u201e", "\u201a", "\u201c", "\u2018", "\u201d", "\u2019", "\u00ab", '"')
    accepted_after_space = (
        (central_european_categorizer, ("\u201e", "\u201a")),
        (british_english_categorizer, ("\u201c", "\u2018")),
        (standard_swedish_categorizer, ("\u201d", "\u2019")),
        (three_conventions_categorizer, ("\u201e", "\u201a", "\u201c", "\u2018", "\u201d", "\u2019")),
    )
    for categorizer, opening_marks in accepted_after_space:
        for mark in candidate_marks:
            expect(categorizer, " " + mark, 1, 2, mark in opening_marks)

    # Unambiguous opening marks do not need leading whitespace.
    unambiguous_without_whitespace = (
        (central_european_categorizer, "\u201e"),
        (central_european_categorizer, "\u201a"),
        (british_english_categorizer, "\u201c"),
        (british_english_categorizer, "\u2018"),
        (three_conventions_categorizer, "\u201e"),
        (three_conventions_categorizer, "\u201a"),
    )
    for categorizer, mark in unambiguous_without_whitespace:
        expect(categorizer, "text" + mark, 4, 5, True)

    # An ambiguous (opening/closing) mark counts as opening when a quote introducer precedes it.
    with_quote_introducer = (
        (standard_swedish_categorizer, "\u201d", 0, 1, False),
        (standard_swedish_categorizer, ",\u201d", 1, 2, True),
        (standard_swedish_categorizer, "\u2019", 0, 1, False),
        (standard_swedish_categorizer, ":\u2019", 1, 2, True),
        (three_conventions_categorizer, "\u201c", 0, 1, False),
        (three_conventions_categorizer, ",\u201c", 1, 2, True),
    )
    for categorizer, text, start, end, is_opening in with_quote_introducer:
        expect(categorizer, text, start, end, is_opening)

    # An ambiguous (opening/closing) mark counts as opening when another opening mark precedes it.
    resolver_state.add_opening_quotation_mark(_segment_match("\u201c", 0, 1))
    after_opening_mark = (
        (standard_swedish_categorizer, "\u201d", False),
        (standard_swedish_categorizer, "\u201c\u201d", True),
        (standard_swedish_categorizer, "\u2019", False),
        (standard_swedish_categorizer, "\u201c\u2019", True),
        (three_conventions_categorizer, "\u201c", False),
        (three_conventions_categorizer, "\u201c\u201c", True),
    )
    for categorizer, text, is_opening in after_opening_mark:
        expect(categorizer, text, 1, 2, is_opening)

    # An ambiguous (opening/closing) mark is not opening when whitespace or punctuation trails it.
    for text in (" \u201d.", ",\u201d ", "\u201c\u2019 ", "\u201c\u2019?"):
        expect(standard_swedish_categorizer, text, 1, 2, False)
standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_french_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_french") + ) + assert standard_french_quote_convention is not None + standard_french_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_french_quote_convention]) + ) + standard_french_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_french_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid closing marks under the quote convention + assert central_european_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert central_european_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quote( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) + ) + + assert not british_english_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) + ) + + assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 
").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) + ) + + assert three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) + ) + + # Trailing whitespace is not necessary for unambiguous closing quotes + assert standard_french_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bbtext").build(), 0, 1) + ) + assert standard_french_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u203atext").build(), 0, 1) + ) + + # An ambiguous quotation mark (opening/closing) is recognized as closing if + # followed by whitespace, punctuation or the end of the segment + assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201dtext").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019text").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019?").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019\u201d").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201ctext").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c?").build(), 0, 1) + ) + + # An ambiguous quotation mark (opening/closing) is not recognized as closing if + # it has leading whitespace + assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\t\u201c?").build(), 1, 2) + ) + + +def test_is_malformed_opening_quote() -> None: + central_european_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + ) + assert central_european_quote_convention is not None + central_european_resolver_settings = 
QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuotationContinuerState() + central_european_quotation_mark_categorizer = QuotationMarkCategorizer( + central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + british_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("british_english") + ) + assert british_english_quote_convention is not None + british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([british_english_quote_convention]) + ) + british_english_quotation_mark_categorizer = QuotationMarkCategorizer( + british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_swedish_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + ) + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid opening marks under the quote convention + assert central_european_quotation_mark_categorizer.is_malformed_opening_quote( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) + ) + assert central_european_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) + ) + + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) + ) + assert british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) + ) + assert not 
british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) + ) + + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) + ) + + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) + ) + + # Should return true if there is a leading quote introducer + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201d ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(":\u2019 ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201c ").build(), 1, 2) + ) + + # Should return false unless the mark has leading and trailing whitespace + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + + # Should return false if there is already an open quotation mark on the stack + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert not 
three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + + +def test_is_malformed_closing_quote() -> None: + central_european_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + ) + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuotationContinuerState() + central_european_quotation_mark_categorizer = QuotationMarkCategorizer( + central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + british_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("british_english") + ) + assert british_english_quote_convention is not None + british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([british_english_quote_convention]) + ) + british_english_quotation_mark_categorizer = QuotationMarkCategorizer( + british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_swedish_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + ) + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = 
QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid closing marks under the quote convention + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) + ) + assert central_european_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not 
central_european_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not 
standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + # Returns true if it's at the end of the segment + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + + # Returns true if it does not have trailing whitespace + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d-").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201dtext").build(), 0, 1) + ) + + # Returns true if it has trailing and leading whitespace + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + + # Requires there to be an open quotation mark on the stack + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + + # Requires the quotation mark on the stack to be a valid pair with the + # observed quotation mark + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_malformed_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + + +def test_is_unpaired_closing_quote() -> None: + central_european_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + ) + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuotationContinuerState() + central_european_quotation_mark_categorizer = QuotationMarkCategorizer( + central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + british_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("british_english") + ) + assert british_english_quote_convention is not None + british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([british_english_quote_convention]) + ) + british_english_quotation_mark_categorizer = QuotationMarkCategorizer( + british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_swedish_quote_convention = ( + 
standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + ) + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid closing marks under the quote convention + assert central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not 
central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + # There must not be an opening quotation mark on the stack + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not 
british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + + # There must not be leading whitespace + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\t\u2019").build(), 1, 2) + ) + + # The quotation mark must be either at the end of the segment + # or have trailing whitespace + assert 
british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d?").build(), 0, 1) + ) + + +def test_is_apostrophe() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuotationContinuerState() + standard_english_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + typewriter_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + ) + assert typewriter_english_quote_convention is not None + typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_english_quote_convention]) + ) + typewriter_english_quotation_mark_categorizer = QuotationMarkCategorizer( + typewriter_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # The quotation mark must make for a plausible apostrophe + assert typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a'b").build(), 1, 2), None + ) + assert 
typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019b").build(), 1, 2), None + ) + assert typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2018b").build(), 1, 2), None + ) + assert not typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u201cb").build(), 1, 2), None + ) + assert not typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text('a"b').build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a'b").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019b").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2018b").build(), 1, 2), None + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u201cb").build(), 1, 2), None + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text('a"b').build(), 1, 2), None + ) + + # Returns true if the mark has Latin letters on both sides + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019Ƅ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("ǡ\u2019b").build(), 1, 2), None 
+ ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("ᴀ\u2019B").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("𝼀\u2019Ꝙ").build(), 1, 2), None + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019ℵ").build(), 1, 2), None + ) + assert typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019ℵ").build(), 1, 2), None + ) + + # Recognizes s possessives (e.g. Moses') + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019 ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("Moses\u2019 ").build(), 5, 6), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019?").build(), 1, 2), None + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u20195").build(), 1, 2), None + ) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019 ").build(), 1, 2), None + ) + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + 
quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019 ").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word\u2019").build(), 4, 5), + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019 ").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word\u201d").build(), 4, 5), + ) + + # the straight quote should always be an apostrophe if it's not a valid quotation mark + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5'ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" ' ").build(), 1, 2), None + ) + + # the straight quote should be an apostrophe if there's nothing on the quotation mark stack + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5'ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" ' ").build(), 1, 2), None + ) + + # any matching mark should be an apostrophe if it doesn't pair with the + # deepest opening quotation mark on the stack + # (opening/closing quotation marks will have been detected before calling this) + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert 
standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5'ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" ' ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5\u2018ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5\u2019ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2), None + ) + + +# DepthBasedQuotationMarkResolver tests +def test_depth_based_quotation_mark_resolver_reset() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201cThis is a quote").build(), 0, 1)] + ) + ) + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + standard_english_quotation_mark_resolver.reset() + assert standard_english_quotation_mark_resolver.get_issues() == set() + + list( + 
standard_english_quotation_mark_resolver.resolve_quotation_marks( + [QuotationMarkStringMatch(TextSegment.Builder().set_text("This is a quote\u2019").build(), 15, 16)] + ) + ) + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + +def test_basic_quotation_mark_recognition() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u2018quote\u2019\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_resolution_only_of_passed_matches() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = 
QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u2018quote\u2019\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u2018quote\u2019\u201d").build() + assert ( + list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 17, 18), + ] + ) + ) + == [] + ) + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + +def test_resolution_across_segments() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment1 = TextSegment.Builder().set_text("\u201cThis is a ").build() + text_segment2 = TextSegment.Builder().set_text("\u2018quote\u2019\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment1, 0, 1), + QuotationMarkStringMatch(text_segment2, 0, 1), + QuotationMarkStringMatch(text_segment2, 6, 7), + 
QuotationMarkStringMatch(text_segment2, 7, 8), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment2, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment2, 6, 7), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment2, 7, 8), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_resolution_with_apostrophes() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = ( + TextSegment.Builder() + .set_text("\u201cThis\u2019 is a \u2018quote\u2019\u201d") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ) + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 5, 6), + QuotationMarkStringMatch(text_segment, 12, 13), + QuotationMarkStringMatch(text_segment, 18, 19), + QuotationMarkStringMatch(text_segment, 19, 20), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment, 12, 13), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 19, 20), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + 
typewriter_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + ) + assert typewriter_english_quote_convention is not None + typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_english_quote_convention]) + ) + typewriter_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_english_resolver_settings) + + text_segment = ( + TextSegment.Builder() + .set_text("\"This' is a 'quote'\"") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ) + assert list( + typewriter_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 5, 6), + QuotationMarkStringMatch(text_segment, 12, 13), + QuotationMarkStringMatch(text_segment, 18, 19), + QuotationMarkStringMatch(text_segment, 19, 20), + ] + ) + ) == [ + QuotationMarkMetadata('"', 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, text_segment, 12, 13), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata('"', 1, QuotationMarkDirection.Closing, text_segment, 19, 20), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_english_quote_continuers() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment1 = TextSegment.Builder().set_text("\u201cThis is a 
\u2018quote").build() + text_segment2 = ( + TextSegment.Builder() + .set_text("\u201c\u2018This is the rest\u2019 of it\u201d") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ) + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment1, 0, 1), + QuotationMarkStringMatch(text_segment1, 11, 12), + QuotationMarkStringMatch(text_segment2, 0, 1), + QuotationMarkStringMatch(text_segment2, 1, 2), + QuotationMarkStringMatch(text_segment2, 18, 19), + QuotationMarkStringMatch(text_segment2, 25, 26), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment1, 11, 12), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment2, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment2, 1, 2), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment2, 18, 19), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment2, 25, 26), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_spanish_quote_continuers() -> None: + western_european_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("western_european") + ) + assert western_european_quote_convention is not None + western_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([western_european_quote_convention]) + ) + western_european_quotation_mark_resolver = DepthBasedQuotationMarkResolver(western_european_resolver_settings) + + text_segment1 = TextSegment.Builder().set_text("\u00abThis is a \u201cquote").build() + text_segment2 = ( + TextSegment.Builder() + .set_text("\u00bb\u201dThis is the rest\u201d of it\u00bb") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + 
.build() + ) + assert list( + western_european_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment1, 0, 1), + QuotationMarkStringMatch(text_segment1, 11, 12), + QuotationMarkStringMatch(text_segment2, 0, 1), + QuotationMarkStringMatch(text_segment2, 1, 2), + QuotationMarkStringMatch(text_segment2, 18, 19), + QuotationMarkStringMatch(text_segment2, 25, 26), + ] + ) + ) == [ + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, text_segment1, 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, text_segment1, 11, 12), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Opening, text_segment2, 0, 1), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Opening, text_segment2, 1, 2), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, text_segment2, 18, 19), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, text_segment2, 25, 26), + ] + assert western_european_quotation_mark_resolver.get_issues() == set() + + +def test_malformed_quotation_marks() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment1 = TextSegment.Builder().set_text("\u201c This is a,\u2018 quote").build() + text_segment2 = ( + TextSegment.Builder() + .set_text("This is the rest \u2019 of it \u201d") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ) + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment1, 0, 1), + QuotationMarkStringMatch(text_segment1, 12, 
13), + QuotationMarkStringMatch(text_segment2, 17, 18), + QuotationMarkStringMatch(text_segment2, 25, 26), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment1, 12, 13), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment2, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment2, 25, 26), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_unpaired_quotation_mark_issue() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u2018quote\u2019").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + text_segment = TextSegment.Builder().set_text("another quote\u201d").build() + assert list( + 
standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 13, 14), + ] + ) + ) == [ + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 13, 14), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + +def test_too_deep_nesting_issue() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = ( + TextSegment.Builder().set_text("\u201cThis \u2018is \u201ca \u2018quote \u201cnested too deeply").build() + ) + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 6, 7), + QuotationMarkStringMatch(text_segment, 10, 11), + QuotationMarkStringMatch(text_segment, 13, 14), + QuotationMarkStringMatch(text_segment, 20, 21), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment, 6, 7), + QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.Opening, text_segment, 10, 11), + QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.Opening, text_segment, 13, 14), + # QuotationMarkMetadata("\u201c", 5, QuotationMarkDirection.Opening, text_segment, 20, 21), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.TOO_DEEP_NESTING, + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, + } + + 
+def test_incompatible_quotation_mark_issue() -> None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u201cquote\u201d\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK + } + + +def test_ambiguous_quotation_mark_issue() -> None: + typewriter_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + ) + assert typewriter_english_quote_convention is not None + typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_english_quote_convention]) + ) + typewriter_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_english_resolver_settings) + + text_segment = 
TextSegment.Builder().set_text('This"is an ambiguous quotation mark').build() + assert ( + list( + typewriter_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 4, 5), + ] + ) + ) + == [] + ) + assert typewriter_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK + } + + typewriter_english_quotation_mark_resolver.reset() + text_segment = TextSegment.Builder().set_text("\u201cThis is an ambiguous quotation mark").build() + assert ( + list( + typewriter_english_quotation_mark_resolver.resolve_quotation_marks( + [QuotationMarkStringMatch(text_segment, 0, 1)] + ) + ) + == [] + ) + assert typewriter_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK + } + + +def test_typewriter_english_quotation_mark_recognition() -> None: + typewriter_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + ) + assert typewriter_english_quote_convention is not None + typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_english_quote_convention]) + ) + typewriter_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_english_resolver_settings) + + text_segment = ( + TextSegment.Builder() + .set_text("\"This is a 'quote'\"") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ) + assert list( + typewriter_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata('"', 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), + 
QuotationMarkMetadata("'", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), + QuotationMarkMetadata('"', 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + ] + assert typewriter_english_quotation_mark_resolver.get_issues() == set() + + +def test_typewriter_french_mark_recognition() -> None: + typewriter_french_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_french") + ) + assert typewriter_french_quote_convention is not None + typewriter_french_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_french_quote_convention]) + ) + typewriter_french_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_french_resolver_settings) + + text_segment = TextSegment.Builder().set_text("<<This is a <quote>>>").build() + assert list( + typewriter_french_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 2), + QuotationMarkStringMatch(text_segment, 12, 13), + QuotationMarkStringMatch(text_segment, 18, 19), + QuotationMarkStringMatch(text_segment, 19, 21), + ] + ) + ) == [ + QuotationMarkMetadata("<<", 1, QuotationMarkDirection.Opening, text_segment, 0, 2), + QuotationMarkMetadata("<", 2, QuotationMarkDirection.Opening, text_segment, 12, 13), + QuotationMarkMetadata(">", 2, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata(">>", 1, QuotationMarkDirection.Closing, text_segment, 19, 21), + ] + assert typewriter_french_quotation_mark_resolver.get_issues() == set() + + +def test_central_european_quotation_mark_recognition() -> None: + central_european_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + ) + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) +
central_european_quotation_mark_resolver = DepthBasedQuotationMarkResolver(central_european_resolver_settings) + + text_segment = ( + TextSegment.Builder() + .set_text("\u201eThis is a \u201aquote\u2018\u201c") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ) + assert list( + central_european_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u201a", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + ] + assert central_european_quotation_mark_resolver.get_issues() == set() + + +def test_standard_swedish_quotation_mark_recognition() -> None: + standard_swedish_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + ) + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_swedish_resolver_settings) + + text_segment = ( + TextSegment.Builder() + .set_text("\u201dThis is a \u2019quote\u2019\u201d") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ) + assert list( + standard_swedish_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + 
QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + ] + assert standard_swedish_quotation_mark_resolver.get_issues() == set() + + +def test_multiple_conventions_quotation_mark_recognition() -> None: + typewriter_french_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_french") + ) + assert typewriter_french_quote_convention is not None + + central_european_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + ) + assert central_european_quote_convention is not None + + standard_swedish_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + ) + assert standard_swedish_quote_convention is not None + multiple_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [typewriter_french_quote_convention, central_european_quote_convention, standard_swedish_quote_convention] + ) + ) + multiple_conventions_quotation_mark_resolver = DepthBasedQuotationMarkResolver( + multiple_conventions_resolver_settings + ) + + text_segment = ( + TextSegment.Builder() + .set_text("\u201eThis is a \u2019quote>\u201c") + .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .build() + ) + assert list( + multiple_conventions_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 
18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), + QuotationMarkMetadata(">", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + ] + assert multiple_conventions_quotation_mark_resolver.get_issues() == set() diff --git a/tests/corpora/analysis/test_preliminary_quotation_analyzer.py b/tests/corpora/analysis/test_preliminary_quotation_analyzer.py new file mode 100644 index 00000000..de167f05 --- /dev/null +++ b/tests/corpora/analysis/test_preliminary_quotation_analyzer.py @@ -0,0 +1,987 @@ +from machine.corpora.analysis import ( + ApostropheProportionStatistics, + Chapter, + PreliminaryApostropheAnalyzer, + PreliminaryQuotationAnalyzer, + QuotationMarkGrouper, + QuotationMarkSequences, + QuotationMarkStringMatch, + QuotationMarkWordPositions, + QuoteConvention, + QuoteConventionSet, + SingleLevelQuoteConvention, + TextSegment, + Verse, +) + + +# ApostropheProportionStatistics tests +def test_apostrophe_proportion_statistics_reset() -> None: + apostrophe_proportion_statistics = ApostropheProportionStatistics() + apostrophe_proportion_statistics.count_characters(TextSegment.Builder().set_text("'").build()) + apostrophe_proportion_statistics.add_apostrophe() + assert apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.5) + + apostrophe_proportion_statistics.reset() + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.5) + + +def test_is_apostrophe_proportion_greater_than() -> None: + apostrophe_proportion_statistics = ApostropheProportionStatistics() + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.0) + + # invalid case where no characters have been counted + apostrophe_proportion_statistics.add_apostrophe() + assert not 
apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.0) + + apostrophe_proportion_statistics.count_characters(TextSegment.Builder().set_text("a").build()) + assert apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.99) + + apostrophe_proportion_statistics.add_apostrophe() + apostrophe_proportion_statistics.count_characters(TextSegment.Builder().set_text("bcd").build()) + assert apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.4) + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.5) + + apostrophe_proportion_statistics.count_characters(TextSegment.Builder().set_text("ef").build()) + assert apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.3) + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.4) + + +# QuotationMarkWordPosition tests +def test_is_mark_rarely_initial() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + assert not quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert not quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + 
quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201c") + assert quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201c") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201c") + assert quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + assert not quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + +def test_is_mark_rarely_final() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + assert not quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert not quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert 
quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201c") + assert quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201c") + quotation_mark_word_positions.count_word_final_apostrophe("\u201c") + assert quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + assert not quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + +def test_are_initial_and_final_rates_similar() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + assert not quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert not quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert not quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + 
quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + assert quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + +def test_is_mark_commonly_mid_word() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + assert not quotation_mark_word_positions.is_mark_commonly_mid_word("'") + + quotation_mark_word_positions.count_mid_word_apostrophe("'") + assert quotation_mark_word_positions.is_mark_commonly_mid_word("'") + + quotation_mark_word_positions.count_word_initial_apostrophe("'") + quotation_mark_word_positions.count_word_final_apostrophe("'") + quotation_mark_word_positions.count_word_initial_apostrophe("'") + quotation_mark_word_positions.count_word_final_apostrophe("'") + assert not quotation_mark_word_positions.is_mark_commonly_mid_word("'") + + quotation_mark_word_positions.count_mid_word_apostrophe("'") + assert quotation_mark_word_positions.is_mark_commonly_mid_word("'") + + +def test_quotation_mark_word_positions_reset() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + + assert quotation_mark_word_positions.is_mark_commonly_mid_word("\u201d") + + quotation_mark_word_positions.reset() + + assert not quotation_mark_word_positions.is_mark_commonly_mid_word("\u201d") + + +# QuotationMarkSequence tests +def test_is_mark_much_more_common_earlier() -> None: + quotation_mark_sequences = QuotationMarkSequences() + assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + quotation_mark_sequences.record_earlier_quotation_mark('"') + 
quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + assert quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + quotation_mark_sequences.record_later_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + assert quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + quotation_mark_sequences.record_later_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + +def test_is_mark_much_more_common_later() -> None: + quotation_mark_sequences = QuotationMarkSequences() + assert not quotation_mark_sequences.is_mark_much_more_common_later('"') + + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + assert quotation_mark_sequences.is_mark_much_more_common_later('"') + + quotation_mark_sequences.record_earlier_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_much_more_common_later('"') + + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + 
quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + assert quotation_mark_sequences.is_mark_much_more_common_later('"') + + quotation_mark_sequences.record_earlier_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_much_more_common_later('"') + + +def test_is_mark_common_early_and_late() -> None: + quotation_mark_sequences = QuotationMarkSequences() + assert not quotation_mark_sequences.is_mark_common_early_and_late('"') + + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + assert quotation_mark_sequences.is_mark_common_early_and_late('"') + + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.record_later_quotation_mark('"') + assert quotation_mark_sequences.is_mark_common_early_and_late('"') + + quotation_mark_sequences.record_later_quotation_mark('"') + assert quotation_mark_sequences.is_mark_common_early_and_late('"') + + quotation_mark_sequences.record_later_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_common_early_and_late('"') + + +# QuotationMarkGrouper tests +def test_get_quotation_mark_pairs() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + 
SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + typewriter_english_quote_convention: QuoteConvention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + quotation_mark_grouper = QuotationMarkGrouper([], QuoteConventionSet([standard_english_quote_convention])) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # no paired quotation mark + quotation_mark_grouper = QuotationMarkGrouper( + [QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1)], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # basic quotation mark pair + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [("\u201c", "\u201d")] + + # out-of-order quotation mark pair + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # multiple unpaired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert 
list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # paired and unpaired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2018\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2018\u201d").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2018\u201d").build(), 2, 3), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [("\u201c", "\u201d")] + + # ambiguous unpaired quotation mark + quotation_mark_grouper = QuotationMarkGrouper( + [QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1)], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # paired ambiguous quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('""').build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text('""').build(), 1, 2), + ], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [('"', '"')] + + # multiple paired quotation marks (should be skipped because we don't know how to pair them) + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u201c\u201d").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u201c\u201d").build(), 2, 3), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u201c\u201d").build(), 3, 4), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert 
list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # multiple different paired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u2018\u2019").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u2018\u2019").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u2018\u2019").build(), 2, 3), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u2018\u2019").build(), 3, 4), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [("\u201c", "\u201d"), ("\u2018", "\u2019")] + + # second-level paired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018\u2019").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018\u2019").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [("\u2018", "\u2019")] + + # quotation marks that don't match the convention set + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2), + ], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + +def test_has_distinct_paired_quotation_marks() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + 
typewriter_english_quote_convention: QuoteConvention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + quotation_mark_grouper = QuotationMarkGrouper( + [], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201c") + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201d") + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("") + + # basic paired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201c") + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201d") + + # second-level paired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018\u2019").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018\u2019").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u2018") + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u2019") + + # only one half of the pair observed + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201c") + assert 
quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201d") + + # quotation marks that don't match the convention set + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2), + ], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201c") + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201d") + + # ambiguous quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('""').build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text('""').build(), 1, 2), + ], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark('"') + + +# PreliminaryApostropheAnalyzer tests +def test_that_the_mark_must_be_an_apostrophe() -> None: + preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("alternative mid\u2019word apostrophe").build(), 15, 16 + ), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid\u2018word quotation mark").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid\u201cword quotation mark").build(), 3, 4), + ], + ) + assert preliminary_apostrophe_analyzer.is_apostrophe_only("'") + 
assert preliminary_apostrophe_analyzer.is_apostrophe_only("\u2019") + assert not preliminary_apostrophe_analyzer.is_apostrophe_only("\u2018") + assert not preliminary_apostrophe_analyzer.is_apostrophe_only("\u201c") + assert not preliminary_apostrophe_analyzer.is_apostrophe_only("\u201d") + + +def test_that_a_rarely_initial_or_final_mark_is_an_apostrophe() -> None: + negative_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + ], + ) + assert not negative_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + positive_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + positive_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + TextSegment.Builder() + .set_text( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .build(), + TextSegment.Builder() + .set_text( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .build(), + TextSegment.Builder() + .set_text("Technically Unicode has a separate character for the glottal stop, but it is rarely used") + .build(), 
+ ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + +def test_that_a_mark_with_similar_final_and_initial_rates_is_an_apostrophe() -> None: + negative_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + TextSegment.Builder() + .set_text("We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test") + .build(), + TextSegment.Builder() + .set_text( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .build(), + 
TextSegment.Builder() + .set_text( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert not negative_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + negative_preliminary_apostrophe_analyzer2 = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer2.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + TextSegment.Builder() + .set_text("We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test") + .build(), + TextSegment.Builder() + .set_text( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .build(), + TextSegment.Builder() + .set_text( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + 
) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert not negative_preliminary_apostrophe_analyzer2.is_apostrophe_only("'") + + positive_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + positive_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + TextSegment.Builder() + .set_text("We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test") + .build(), + TextSegment.Builder() + .set_text( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .build(), + TextSegment.Builder() + .set_text( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + +def test_that_a_commonly_mid_word_mark_is_an_apostrophe() -> None: + negative_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + ], + ) + assert not negative_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + positive_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + positive_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + +def test_that_a_frequently_occurring_character_is_an_apostrophe() -> None: + negative_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + ], + ) + assert not negative_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + positive_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + positive_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Very short text").build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + ], + ) + assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + +# PreliminaryQuotationAnalyzer tests +def test_that_quotation_mark_sequence_is_used_to_determine_opening_and_closing_quotes() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", 
"\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + typewriter_english_quote_convention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + standard_french_quote_convention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + western_european_quote_convention = QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + standard_swedish_quote_convention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + preliminary_quotation_analyzer = PreliminaryQuotationAnalyzer( + QuoteConventionSet( + [ + standard_english_quote_convention, + typewriter_english_quote_convention, + standard_french_quote_convention, + western_european_quote_convention, + standard_swedish_quote_convention, + ] + ) + ) + + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u201c quoted English text \u201d final text") + .build() + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_english_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text 
\u201d quoted Swedish text \u201d final text") + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_swedish_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u00ab quoted French/Western European text \u00bb final text") + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_french_quote_convention, western_european_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text('initial text " quoted typewriter English text " final text') + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([typewriter_english_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u201c quoted English text \u201d final text") + .build(), + TextSegment.Builder().set_text("second level \u2018 English quotes \u2019").build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_english_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text('initial text " quoted typewriter English text " final text') + .build(), + TextSegment.Builder().set_text("second level 'typewriter quotes'").build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([typewriter_english_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u201c quoted English text \u201d final 
text") + .build(), + TextSegment.Builder() + .set_text("the quotes \u201d in this segment \u201c are backwards") + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("first-level quotes \u2018 must be observed \u2019 to retain a quote convention") + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([]) + + +def test_that_apostrophes_not_considered_as_quotation_marks() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + typewriter_english_quote_convention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + preliminary_quotation_analyzer = PreliminaryQuotationAnalyzer( + QuoteConventionSet( + [ + standard_english_quote_convention, + typewriter_english_quote_convention, + ] + ) + ) + + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("ini'tial 'text \u201c quo'ted English text' \u201d fi'nal text") + .build() + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_english_quote_convention]) diff --git a/tests/corpora/analysis/test_quotation_mark_finder.py b/tests/corpora/analysis/test_quotation_mark_finder.py new file mode 100644 index 00000000..10f6cb52 --- /dev/null +++ b/tests/corpora/analysis/test_quotation_mark_finder.py @@ -0,0 +1,290 @@ +from machine.corpora.analysis import ( + QuotationMarkFinder, + QuotationMarkStringMatch, + QuoteConventionSet, + 
TextSegment, + standard_quote_conventions, +) + + +def test_that_all_possible_quotation_marks_are_identified() -> None: + quotation_mark_finder = QuotationMarkFinder(standard_quote_conventions.standard_quote_conventions) + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("\u201cSample Text\u201d").build() + ) == [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201cSample Text\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201cSample Text\u201d").build(), 12, 13), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("\"Sample Text'").build() + ) == [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\"Sample Text'").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\"Sample Text'").build(), 12, 13), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build() + ) == [ + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build(), 4, 5 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build(), 9, 10 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build(), 27, 28 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build(), 34, 35 + ), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build() + ) == [ + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a 
marks\u00bb").build(), 4, 5 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build(), 9, 10 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build(), 26, 27 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build(), 33, 34 + ), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("All \"the 'typewriter quotation marks").build() + ) == [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("All \"the 'typewriter quotation marks").build(), 4, 5), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \"the 'typewriter quotation marks").build(), 9, 10 + ), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder() + .set_text("This has \u201equotes from \u00bbdifferent conventions < None: + standard_english_quote_convention = ( + standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + ) + assert standard_english_quote_convention is not None + + english_quotation_mark_finder = QuotationMarkFinder(QuoteConventionSet([standard_english_quote_convention])) + assert ( + english_quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder() + .set_text("This has \u201equotes from \u00bbdifferent conventions < None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample, text").build(), 8, 9) assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() - - -def test_has_leading_closing_quotation_mark() -> None: - standard_english_quote_convention = QuoteConvention( - "standard_english", - [ - SingleLevelQuoteConvention("\u201c", "\u201d"), - SingleLevelQuoteConvention("\u2018", 
"\u2019"), - SingleLevelQuoteConvention("\u201c", "\u201d"), - SingleLevelQuoteConvention("\u2018", "\u2019"), - ], - ) - standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) - - normalized_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention.normalize()]) - - quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample\u2019\u201d text").build(), 7, 8 - ) - assert quotation_mark_string_match.has_leading_closing_quotation_mark(standard_english_quote_convention_set) - assert not quotation_mark_string_match.has_leading_closing_quotation_mark(normalized_english_quote_convention_set) - - quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text('sample\u2019" text').build(), 7, 8 - ) - assert quotation_mark_string_match.has_leading_closing_quotation_mark(standard_english_quote_convention_set) - assert not quotation_mark_string_match.has_leading_closing_quotation_mark(normalized_english_quote_convention_set) - - quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text('sample"\u201d text').build(), 7, 8 - ) - assert not quotation_mark_string_match.has_leading_closing_quotation_mark(standard_english_quote_convention_set) - assert quotation_mark_string_match.has_leading_closing_quotation_mark(normalized_english_quote_convention_set) - - quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text('sample" \u201d text').build(), 8, 9 - ) - assert not quotation_mark_string_match.has_leading_closing_quotation_mark(standard_english_quote_convention_set) - assert not quotation_mark_string_match.has_leading_closing_quotation_mark(normalized_english_quote_convention_set) - - -def test_has_trailing_closing_quotation_mark() -> None: - standard_english_quote_convention = QuoteConvention( - "standard_english", - [ - SingleLevelQuoteConvention("\u201c", "\u201d"), - 
SingleLevelQuoteConvention("\u2018", "\u2019"), - SingleLevelQuoteConvention("\u201c", "\u201d"), - SingleLevelQuoteConvention("\u2018", "\u2019"), - ], - ) - standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) - - normalized_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention.normalize()]) - - quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample\u2019\u201d text").build(), 6, 7 - ) - assert quotation_mark_string_match.has_trailing_closing_quotation_mark(standard_english_quote_convention_set) - assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(normalized_english_quote_convention_set) - - quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text('sample\u2019" text').build(), 6, 7 - ) - assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(standard_english_quote_convention_set) - assert quotation_mark_string_match.has_trailing_closing_quotation_mark(normalized_english_quote_convention_set) - - quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text('sample"\u201d text').build(), 6, 7 - ) - assert quotation_mark_string_match.has_trailing_closing_quotation_mark(standard_english_quote_convention_set) - assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(normalized_english_quote_convention_set) - - quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text('sample" \u201d text').build(), 6, 7 - ) - assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(standard_english_quote_convention_set) - assert not quotation_mark_string_match.has_trailing_closing_quotation_mark(normalized_english_quote_convention_set) diff --git a/tests/corpora/analysis/test_quote_convention_set.py b/tests/corpora/analysis/test_quote_convention_set.py index 01c79d34..d24996e2 100644 --- 
a/tests/corpora/analysis/test_quote_convention_set.py +++ b/tests/corpora/analysis/test_quote_convention_set.py @@ -1,8 +1,13 @@ +from pytest import approx + from machine.corpora.analysis import ( QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkTabulator, QuoteConvention, QuoteConventionSet, SingleLevelQuoteConvention, + TextSegment, ) @@ -1141,5 +1146,181 @@ def test_filter_to_compatible_quote_conventions() -> None: def test_find_most_similar_convention() -> None: - # TODO: test this after testing QuotationMarkTabulator - pass + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + western_european_quote_convention: QuoteConvention = QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + all_three_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + standard_french_quote_convention, + western_european_quote_convention, + ] + ) + two_french_quote_convention_set = QuoteConventionSet( + [western_european_quote_convention, standard_french_quote_convention] + ) + + multiple_english_quotes_tabulator = QuotationMarkTabulator() + multiple_english_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, 
QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(multiple_english_quotes_tabulator) == ( + standard_english_quote_convention, + 1.0, + ) + + multiple_western_european_quotes_tabulator = QuotationMarkTabulator() + multiple_western_european_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(multiple_western_european_quotes_tabulator) == ( + western_european_quote_convention, + 1.0, + ) + + multiple_french_quotes_tabulator = QuotationMarkTabulator() + multiple_french_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u203a", 2, 
QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(multiple_french_quotes_tabulator) == ( + standard_french_quote_convention, + 1.0, + ) + assert two_french_quote_convention_set.find_most_similar_convention(multiple_french_quotes_tabulator) == ( + standard_french_quote_convention, + 1.0, + ) + + noisy_multiple_english_quotes_tabulator = QuotationMarkTabulator() + noisy_multiple_english_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( + standard_english_quote_convention, + approx(0.9, rel=1e-9), + ) + assert two_french_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( + western_european_quote_convention, + approx(0.1, rel=1e-9), + ) + + noisy_multiple_french_quotes_tabulator = QuotationMarkTabulator() + noisy_multiple_french_quotes_tabulator.tabulate( + [ + 
QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u203a", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_french_quotes_tabulator) == ( + standard_french_quote_convention, + approx(0.916666666666, rel=1e-9), + ) + + too_deep_english_quotes_tabulator = QuotationMarkTabulator() + too_deep_english_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 15, 16), + QuotationMarkMetadata("\u201c", 5, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 17, 18), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(too_deep_english_quotes_tabulator) == ( + standard_english_quote_convention, + approx(0.967741935483871, rel=1e-9), + ) + + # in case of ties, the earlier convention in the list should be returned + 
unknown_quote_tabulator = QuotationMarkTabulator() + unknown_quote_tabulator.tabulate( + [QuotationMarkMetadata("\u201a", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1)] + ) + assert all_three_quote_convention_set.find_most_similar_convention(unknown_quote_tabulator) == ( + standard_english_quote_convention, + 0.0, + ) + + single_french_opening_quote_tabulator = QuotationMarkTabulator() + single_french_opening_quote_tabulator.tabulate( + [QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1)] + ) + assert all_three_quote_convention_set.find_most_similar_convention(single_french_opening_quote_tabulator) == ( + standard_french_quote_convention, + 1.0, + ) + assert two_french_quote_convention_set.find_most_similar_convention(single_french_opening_quote_tabulator) == ( + western_european_quote_convention, + 1.0, + ) + + # Default values should be returned when the QuoteConventionSet is empty + single_english_opening_quote_tabulator = QuotationMarkTabulator() + single_english_opening_quote_tabulator.tabulate( + [QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1)] + ) + empty_quote_convention_set = QuoteConventionSet([]) + assert empty_quote_convention_set.find_most_similar_convention(single_english_opening_quote_tabulator) == ( + None, + float("-inf"), + ) From 8d9c2f67ee354da8d885ea473e0bbec8198574d3 Mon Sep 17 00:00:00 2001 From: Ben King Date: Sat, 28 Jun 2025 16:25:24 -0400 Subject: [PATCH 17/31] Change update_usfm_parser_handler.py to match main branch --- machine/corpora/update_usfm_parser_handler.py | 11 +++-------- tests/corpora/test_update_usfm_parser_handler.py | 4 +--- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 203e8971..a51021b2 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ 
b/machine/corpora/update_usfm_parser_handler.py @@ -1,8 +1,6 @@ from enum import Enum, auto from typing import Iterable, List, Optional, Sequence, Tuple, Union -from ..scripture.verse_ref import VerseRef -from .scripture_embed import is_embed_part_style from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType from .usfm_parser_state import UsfmParserState @@ -355,12 +353,9 @@ def _skip_updatable_tokens(self, state: UsfmParserState) -> None: self._token_index += 1 self._token_index = state.index + 1 + state.special_token_count - def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) -> bool: - marker: Optional[str] = state.token if state.token is None else state.token.marker - in_embed: bool = self._is_in_embed(marker) - - in_nested_embed: bool = self._is_in_nested_embed(marker) - is_style_tag: bool = marker is not None and not is_embed_part_style(marker) + def _replace_with_new_tokens(self, state: UsfmParserState) -> bool: + if self._current_text_type == ScriptureTextType.EMBED: + return False existing_text = any( t.type == UsfmTokenType.TEXT and t.text diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 510ae264..47cd6280 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -209,7 +209,6 @@ def test_paragraph_in_verse(): usfm = r"""\id MAT - Test \c 1 \p paragraph not in a verse -\p paragraph not in a verse \v 1 verse 1 \p inner verse paragraph \s1 Section Header \v 2 Verse 2 \p inner verse paragraph @@ -220,7 +219,6 @@ def test_paragraph_in_verse(): result = r"""\id MAT - Test \c 1 \p paragraph not in a verse -\p paragraph not in a verse \v 1 Update 1 \s1 Section Header \v 2 Verse 2 inner verse paragraph @@ -237,7 +235,7 @@ def test_paragraph_in_verse(): result_strip = r"""\id MAT \c 1 -\p +\p \v 1 Update 1 \s1 \v 2 From 
cb31252bc2ee58c9afa058d3e105af9b15d0a337 Mon Sep 17 00:00:00 2001 From: Ben King Date: Sat, 28 Jun 2025 16:45:31 -0400 Subject: [PATCH 18/31] Fix linting for test_quote_convention_detector.py --- tests/corpora/analysis/test_quote_convention_detector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/corpora/analysis/test_quote_convention_detector.py b/tests/corpora/analysis/test_quote_convention_detector.py index 59321a0e..a9c142b2 100644 --- a/tests/corpora/analysis/test_quote_convention_detector.py +++ b/tests/corpora/analysis/test_quote_convention_detector.py @@ -289,9 +289,9 @@ def test_mismatched_quotation_marks() -> None: of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” - \\v 2 The woman said to the serpent, - “We may eat fruit from the trees of the garden, - \\v 3 but not the fruit of the tree which is in the middle of the garden. + \\v 2 The woman said to the serpent, + “We may eat fruit from the trees of the garden, + \\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. 
You shall not touch it, lest you die.’ """ analysis = detect_quote_convention(usfm) From a9da4a841131e726195f5bf4937bf749da398dde Mon Sep 17 00:00:00 2001 From: Ben King Date: Tue, 1 Jul 2025 14:55:01 -0400 Subject: [PATCH 19/31] Address reviewer comments + refactor --- machine/corpora/analysis/chapter.py | 9 +- .../depth_based_quotation_mark_resolver.py | 151 +++---- .../preliminary_quotation_analyzer.py | 96 ++-- .../analysis/quotation_mark_direction.py | 4 +- .../corpora/analysis/quotation_mark_finder.py | 12 +- .../analysis/quotation_mark_metadata.py | 53 +-- .../quotation_mark_resolution_settings.py | 12 +- .../analysis/quotation_mark_resolver.py | 6 +- .../analysis/quotation_mark_string_match.py | 155 +++---- .../analysis/quotation_mark_tabulator.py | 50 +- machine/corpora/analysis/quote_convention.py | 35 +- ...onvention_detection_resolution_settings.py | 10 +- .../analysis/quote_convention_detector.py | 21 +- .../corpora/analysis/quote_convention_set.py | 59 +-- machine/corpora/analysis/text_segment.py | 91 ++-- machine/corpora/analysis/usfm_marker_type.py | 14 +- .../analysis/usfm_structure_extractor.py | 44 +- machine/corpora/analysis/verse.py | 11 +- .../fallback_quotation_mark_resolver.py | 14 +- .../quotation_denormalization_first_pass.py | 1 + ...normalization_usfm_update_block_handler.py | 1 + .../quotation_mark_update_first_pass.py | 11 +- ...otation_mark_update_resolution_settings.py | 8 +- ...tion_changing_usfm_update_block_handler.py | 12 +- tests/corpora/analysis/test_chapter.py | 6 +- ...est_depth_based_quotation_mark_resolver.py | 322 ++++++------- .../analysis/test_quotation_mark_metadata.py | 12 +- .../analysis/test_quotation_mark_resolver.py | 33 +- .../test_quotation_mark_string_match.py | 96 ++-- .../analysis/test_quotation_mark_tabulator.py | 16 +- .../corpora/analysis/test_quote_convention.py | 104 ++--- .../test_quote_convention_detector.py | 44 +- .../analysis/test_quote_convention_set.py | 426 +++++++++--------- 
tests/corpora/analysis/test_text_segment.py | 190 ++++---- .../analysis/test_usfm_structure_extractor.py | 148 +++--- tests/corpora/analysis/test_verse.py | 16 +- .../test_fallback_quotation_mark_resolver.py | 21 +- .../test_quotation_mark_update_first_pass.py | 32 +- ...tion_changing_usfm_block_update_handler.py | 105 +++-- 39 files changed, 1181 insertions(+), 1270 deletions(-) diff --git a/machine/corpora/analysis/chapter.py b/machine/corpora/analysis/chapter.py index f96441e6..30d01add 100644 --- a/machine/corpora/analysis/chapter.py +++ b/machine/corpora/analysis/chapter.py @@ -1,9 +1,8 @@ +from dataclasses import dataclass + from .verse import Verse +@dataclass class Chapter: - def __init__(self, verses: list[Verse]): - self.verses = verses - - def get_verses(self) -> list[Verse]: - return self.verses + verses: list[Verse] diff --git a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/analysis/depth_based_quotation_mark_resolver.py index 9695b65a..004212b1 100644 --- a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py +++ b/machine/corpora/analysis/depth_based_quotation_mark_resolver.py @@ -18,43 +18,44 @@ def __init__(self): self.reset() def reset(self) -> None: - self.quotation_stack: list[QuotationMarkMetadata] = [] - self.current_depth: int = 0 + self._quotation_stack: list[QuotationMarkMetadata] = [] + self._current_depth: int = 0 - def get_current_depth(self) -> int: - return self.current_depth + 1 + @property + def current_depth(self) -> int: + return self._current_depth + 1 def has_open_quotation_mark(self) -> bool: - return self.current_depth > 0 + return self._current_depth > 0 def are_more_than_n_quotes_open(self, n: int) -> bool: - return self.current_depth > n + return self._current_depth > n def add_opening_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - quote = quote_match.resolve(self.current_depth + 1, QuotationMarkDirection.Opening) - 
self.quotation_stack.append(quote) - self.current_depth += 1 + quote = quote_match.resolve(self._current_depth + 1, QuotationMarkDirection.OPENING) + self._quotation_stack.append(quote) + self._current_depth += 1 return quote def add_closing_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - quote = quote_match.resolve(self.current_depth, QuotationMarkDirection.Closing) - self.quotation_stack.pop() - self.current_depth -= 1 + quote = quote_match.resolve(self._current_depth, QuotationMarkDirection.CLOSING) + self._quotation_stack.pop() + self._current_depth -= 1 return quote def get_opening_quotation_mark_at_depth(self, depth: int) -> str: - if depth > len(self.quotation_stack): + if depth > len(self._quotation_stack): raise RuntimeError( "get_opening_quotation_mark_at_depth() was called with a depth greater than the quotation stack size." ) - return self.quotation_stack[depth - 1].get_quotation_mark() + return self._quotation_stack[depth - 1].quotation_mark def get_deepest_opening_quotation_mark(self) -> str: if not self.has_open_quotation_mark(): raise RuntimeError( "get_deepest_opening_quotation_mark() was called when the stack of quotation marks was empty." 
) - return self.quotation_stack[-1].get_quotation_mark() + return self._quotation_stack[-1].quotation_mark class QuotationContinuerStyle(Enum): @@ -68,18 +69,20 @@ def __init__(self): self.reset() def reset(self) -> None: - self.quotation_continuer_stack: list[QuotationMarkMetadata] = [] - self.current_depth = 0 - self.continuer_style = QuotationContinuerStyle.UNDETERMINED + self._quotation_continuer_stack: list[QuotationMarkMetadata] = [] + self._current_depth = 0 + self._continuer_style = QuotationContinuerStyle.UNDETERMINED - def get_current_depth(self) -> int: - return self.current_depth + @property + def current_depth(self) -> int: + return self._current_depth - def has_continuer_been_observed(self) -> bool: - return len(self.quotation_continuer_stack) > 0 + def continuer_has_been_observed(self) -> bool: + return len(self._quotation_continuer_stack) > 0 - def get_continuer_style(self) -> QuotationContinuerStyle: - return self.continuer_style + @property + def continuer_style(self) -> QuotationContinuerStyle: + return self._continuer_style def add_quotation_continuer( self, @@ -87,13 +90,13 @@ def add_quotation_continuer( quotation_mark_resolver_state: QuotationMarkResolverState, quotation_continuer_style: QuotationContinuerStyle, ) -> QuotationMarkMetadata: - quote = quote_match.resolve(len(self.quotation_continuer_stack) + 1, QuotationMarkDirection.Opening) - self.quotation_continuer_stack.append(quote) - self.current_depth += 1 - self.continuer_style = quotation_continuer_style - if len(self.quotation_continuer_stack) == len(quotation_mark_resolver_state.quotation_stack): - self.quotation_continuer_stack.clear() - self.current_depth = 0 + quote = quote_match.resolve(len(self._quotation_continuer_stack) + 1, QuotationMarkDirection.OPENING) + self._quotation_continuer_stack.append(quote) + self._current_depth += 1 + self._continuer_style = quotation_continuer_style + if len(self._quotation_continuer_stack) == 
len(quotation_mark_resolver_state._quotation_stack): + self._quotation_continuer_stack.clear() + self._current_depth = 0 return quote @@ -116,30 +119,24 @@ def is_english_quotation_continuer( previous_match: Union[QuotationMarkStringMatch, None], next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if self._quotation_continuer_state.get_continuer_style() == QuotationContinuerStyle.SPANISH: + if self._quotation_continuer_state.continuer_style == QuotationContinuerStyle.SPANISH: return False if not self._meets_quote_continuer_prerequisites(quote_match, previous_match, next_match): return False - if not self._quotation_continuer_state.has_continuer_been_observed(): - if quote_match.start_index > 0: + if not self._quotation_continuer_state.continuer_has_been_observed(): + if quote_match._start_index > 0: return False - if ( - quote_match.get_quotation_mark() - != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( - self._quotation_continuer_state.get_current_depth() + 1 - ) + if quote_match.quotation_mark != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( + self._quotation_continuer_state.current_depth + 1 ): return False if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): - if next_match is None or next_match.get_start_index() != quote_match.get_end_index(): + if next_match is None or next_match.start_index != quote_match.end_index: return False else: - if ( - quote_match.get_quotation_mark() - != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( - self._quotation_continuer_state.get_current_depth() + 1 - ) + if quote_match.quotation_mark != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( + self._quotation_continuer_state.current_depth + 1 ): return False @@ -151,34 +148,34 @@ def is_spanish_quotation_continuer( previous_match: Union[QuotationMarkStringMatch, None], next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if 
self._quotation_continuer_state.get_continuer_style() == QuotationContinuerStyle.ENGLISH: + if self._quotation_continuer_state.continuer_style == QuotationContinuerStyle.ENGLISH: return False if not self._meets_quote_continuer_prerequisites(quote_match, previous_match, next_match): return False - if not self._quotation_continuer_state.has_continuer_been_observed(): - if quote_match.start_index > 0: + if not self._quotation_continuer_state.continuer_has_been_observed(): + if quote_match._start_index > 0: return False # this has only been observed with guillemets so far - if quote_match.get_quotation_mark() != "»": + if quote_match.quotation_mark != "»": return False if not self._settings.are_marks_a_valid_pair( self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( - self._quotation_continuer_state.get_current_depth() + 1 + self._quotation_continuer_state.current_depth + 1 ), - quote_match.get_quotation_mark(), + quote_match.quotation_mark, ): return False if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): - if next_match is None or next_match.get_start_index() != quote_match.get_end_index(): + if next_match is None or next_match.start_index != quote_match.end_index: return False else: if not self._settings.are_marks_a_valid_pair( self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( - self._quotation_continuer_state.get_current_depth() + 1 + self._quotation_continuer_state.current_depth + 1 ), - quote_match.get_quotation_mark(), + quote_match.quotation_mark, ): return False @@ -192,7 +189,7 @@ def _meets_quote_continuer_prerequisites( ) -> bool: if ( self._settings.should_rely_on_paragraph_markers() - and not quote_match.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker) + and not quote_match._text_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH) ): return False if not self._quotation_mark_resolver_state.has_open_quotation_mark(): @@ -212,7 +209,7 @@ def 
is_opening_quote( if self._settings.is_valid_closing_quotation_mark(match): return ( match.has_leading_whitespace() - or self._does_most_recent_opening_mark_immediately_precede(match) + or self._most_recent_opening_mark_immediately_precedes(match) or match.has_quote_introducer_in_leading_substring() ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation()) return True @@ -231,7 +228,7 @@ def is_closing_quote( match.has_trailing_whitespace() or match.has_trailing_punctuation() or match.is_at_end_of_segment() - or match.does_next_character_match(self._settings.get_closing_quotation_mark_regex()) + or match.next_character_matches(self._settings.get_closing_quotation_mark_regex()) ) and not match.has_leading_whitespace() return True @@ -269,7 +266,7 @@ def is_malformed_closing_quote( ) and self._quotation_mark_resolver_state.has_open_quotation_mark() and self._settings.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.quotation_mark ) ) @@ -285,55 +282,53 @@ def is_unpaired_closing_quote( return not match.has_leading_whitespace() and (match.is_at_end_of_segment() or match.has_trailing_whitespace()) - def _does_most_recent_opening_mark_immediately_precede(self, match: QuotationMarkStringMatch) -> bool: + def _most_recent_opening_mark_immediately_precedes(self, match: QuotationMarkStringMatch) -> bool: if not self._quotation_mark_resolver_state.has_open_quotation_mark(): return False - return ( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.get_previous_character() - ) + return self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.previous_character def is_apostrophe( self, match: QuotationMarkStringMatch, next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if not 
match.does_quotation_mark_match(self.apostrophe_pattern): + if not match.quotation_mark_matches(self.apostrophe_pattern): return False # Latin letters on both sides of punctuation mark if ( - match.get_previous_character() is not None + match.previous_character is not None and match.has_leading_latin_letter() - and match.get_next_character() is not None + and match.next_character is not None and match.has_trailing_latin_letter() ): return True # potential final s possessive (e.g. Moses') - if match.does_previous_character_match(regex.compile(r"s")) and ( + if match.previous_character_matches(regex.compile(r"s")) and ( match.has_trailing_whitespace() or match.has_trailing_punctuation() ): # check whether it could be a closing quote if not self._quotation_mark_resolver_state.has_open_quotation_mark(): return True if not self._settings.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.quotation_mark ): return True if next_match is not None and self._settings.are_marks_a_valid_pair( self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), - next_match.get_quotation_mark(), + next_match.quotation_mark, ): return True # for languages that use apostrophes at the start and end of words if ( not self._quotation_mark_resolver_state.has_open_quotation_mark() - and match.get_quotation_mark() == "'" + and match.quotation_mark == "'" or self._quotation_mark_resolver_state.has_open_quotation_mark() and not self._settings.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.get_quotation_mark() + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.quotation_mark ) ): return True @@ -411,19 +406,19 @@ def _is_depth_too_great(self) -> bool: return self._quotation_mark_resolver_state.are_more_than_n_quotes_open(3) def 
_process_opening_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - if not self._settings.does_metadata_match_quotation_mark( - quote_match.get_quotation_mark(), - self._quotation_mark_resolver_state.get_current_depth(), - QuotationMarkDirection.Opening, + if not self._settings.metadata_matches_quotation_mark( + quote_match.quotation_mark, + self._quotation_mark_resolver_state.current_depth, + QuotationMarkDirection.OPENING, ): self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) return self._quotation_mark_resolver_state.add_opening_quotation_mark(quote_match) def _process_closing_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - if not self._settings.does_metadata_match_quotation_mark( - quote_match.get_quotation_mark(), - self._quotation_mark_resolver_state.get_current_depth() - 1, - QuotationMarkDirection.Closing, + if not self._settings.metadata_matches_quotation_mark( + quote_match.quotation_mark, + self._quotation_mark_resolver_state.current_depth - 1, + QuotationMarkDirection.CLOSING, ): self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) return self._quotation_mark_resolver_state.add_closing_quotation_mark(quote_match) diff --git a/machine/corpora/analysis/preliminary_quotation_analyzer.py b/machine/corpora/analysis/preliminary_quotation_analyzer.py index a5699e94..c3a43a94 100644 --- a/machine/corpora/analysis/preliminary_quotation_analyzer.py +++ b/machine/corpora/analysis/preliminary_quotation_analyzer.py @@ -15,19 +15,19 @@ def __init__(self): self.reset() def reset(self) -> None: - self.num_characters = 0 - self.num_apostrophes = 0 + self._num_characters = 0 + self._num_apostrophes = 0 def count_characters(self, text_segment: TextSegment) -> None: - self.num_characters += len(text_segment.get_text()) + self._num_characters += text_segment.length def add_apostrophe(self) -> None: - self.num_apostrophes += 1 + self._num_apostrophes += 1 def 
is_apostrophe_proportion_greater_than(self, threshold: float) -> bool: - if self.num_characters == 0: + if self._num_characters == 0: return False - return self.num_apostrophes / self.num_characters > threshold + return self._num_apostrophes / self._num_characters > threshold class QuotationMarkWordPositions: @@ -35,33 +35,33 @@ def __init__(self): self.reset() def reset(self) -> None: - self.word_initial_occurrences: Dict[str, int] = dict() - self.mid_word_occurrences: Dict[str, int] = dict() - self.word_final_occurrences: Dict[str, int] = dict() + self._word_initial_occurrences: Dict[str, int] = dict() + self._mid_word_occurrences: Dict[str, int] = dict() + self._word_final_occurrences: Dict[str, int] = dict() def count_word_initial_apostrophe(self, quotation_mark: str) -> None: - if quotation_mark not in self.word_initial_occurrences: - self.word_initial_occurrences[quotation_mark] = 0 - self.word_initial_occurrences[quotation_mark] += 1 + if quotation_mark not in self._word_initial_occurrences: + self._word_initial_occurrences[quotation_mark] = 0 + self._word_initial_occurrences[quotation_mark] += 1 def count_mid_word_apostrophe(self, quotation_mark: str) -> None: - if quotation_mark not in self.mid_word_occurrences: - self.mid_word_occurrences[quotation_mark] = 0 - self.mid_word_occurrences[quotation_mark] += 1 + if quotation_mark not in self._mid_word_occurrences: + self._mid_word_occurrences[quotation_mark] = 0 + self._mid_word_occurrences[quotation_mark] += 1 def count_word_final_apostrophe(self, quotation_mark: str) -> None: - if quotation_mark not in self.word_final_occurrences: - self.word_final_occurrences[quotation_mark] = 0 - self.word_final_occurrences[quotation_mark] += 1 + if quotation_mark not in self._word_final_occurrences: + self._word_final_occurrences[quotation_mark] = 0 + self._word_final_occurrences[quotation_mark] += 1 def _get_word_initial_occurrences(self, quotation_mark: str) -> int: - return 
self.word_initial_occurrences[quotation_mark] if quotation_mark in self.word_initial_occurrences else 0 + return self._word_initial_occurrences[quotation_mark] if quotation_mark in self._word_initial_occurrences else 0 def _get_mid_word_occurrences(self, quotation_mark: str) -> int: - return self.mid_word_occurrences[quotation_mark] if quotation_mark in self.mid_word_occurrences else 0 + return self._mid_word_occurrences[quotation_mark] if quotation_mark in self._mid_word_occurrences else 0 def _get_word_final_occurrences(self, quotation_mark: str) -> int: - return self.word_final_occurrences[quotation_mark] if quotation_mark in self.word_final_occurrences else 0 + return self._word_final_occurrences[quotation_mark] if quotation_mark in self._word_final_occurrences else 0 def _get_total_occurrences(self, quotation_mark: str) -> int: return ( @@ -97,30 +97,30 @@ def __init__(self): self.reset() def reset(self) -> None: - self.earlier_quotation_mark_counts: Dict[str, int] = dict() - self.later_quotation_mark_counts: Dict[str, int] = dict() + self._earlier_quotation_mark_counts: Dict[str, int] = dict() + self._later_quotation_mark_counts: Dict[str, int] = dict() def record_earlier_quotation_mark(self, quotation_mark: str) -> None: - if quotation_mark not in self.earlier_quotation_mark_counts: - self.earlier_quotation_mark_counts[quotation_mark] = 0 - self.earlier_quotation_mark_counts[quotation_mark] += 1 + if quotation_mark not in self._earlier_quotation_mark_counts: + self._earlier_quotation_mark_counts[quotation_mark] = 0 + self._earlier_quotation_mark_counts[quotation_mark] += 1 def record_later_quotation_mark(self, quotation_mark: str) -> None: - if quotation_mark not in self.later_quotation_mark_counts: - self.later_quotation_mark_counts[quotation_mark] = 0 - self.later_quotation_mark_counts[quotation_mark] += 1 + if quotation_mark not in self._later_quotation_mark_counts: + self._later_quotation_mark_counts[quotation_mark] = 0 + 
self._later_quotation_mark_counts[quotation_mark] += 1 def _get_earlier_occurrences(self, quotation_mark: str) -> int: return ( - self.earlier_quotation_mark_counts[quotation_mark] - if quotation_mark in self.earlier_quotation_mark_counts + self._earlier_quotation_mark_counts[quotation_mark] + if quotation_mark in self._earlier_quotation_mark_counts else 0 ) def _get_later_occurrences(self, quotation_mark: str) -> int: return ( - self.later_quotation_mark_counts[quotation_mark] - if quotation_mark in self.later_quotation_mark_counts + self._later_quotation_mark_counts[quotation_mark] + if quotation_mark in self._later_quotation_mark_counts else 0 ) @@ -149,22 +149,22 @@ def is_mark_common_early_and_late(self, quotation_mark: str) -> bool: class QuotationMarkGrouper: def __init__(self, quotation_marks: List[QuotationMarkStringMatch], quote_convention_set: QuoteConventionSet): - self.quote_convention_set = quote_convention_set + self._quote_convention_set = quote_convention_set self._group_quotation_marks(quotation_marks) def _group_quotation_marks(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: - self.grouped_quotation_marks: Dict[str, List[QuotationMarkStringMatch]] = dict() + self._grouped_quotation_marks: Dict[str, List[QuotationMarkStringMatch]] = dict() for quotation_mark_match in quotation_marks: - if quotation_mark_match.get_quotation_mark() not in self.grouped_quotation_marks: - self.grouped_quotation_marks[quotation_mark_match.get_quotation_mark()] = [] - self.grouped_quotation_marks[quotation_mark_match.get_quotation_mark()].append(quotation_mark_match) + if quotation_mark_match.quotation_mark not in self._grouped_quotation_marks: + self._grouped_quotation_marks[quotation_mark_match.quotation_mark] = [] + self._grouped_quotation_marks[quotation_mark_match.quotation_mark].append(quotation_mark_match) def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: - for mark1, matches1 in self.grouped_quotation_marks.items(): + 
for mark1, matches1 in self._grouped_quotation_marks.items(): # handle cases of identical opening/closing marks if ( len(matches1) == 2 - and self.quote_convention_set.is_quotation_mark_direction_ambiguous(mark1) + and self._quote_convention_set.is_quotation_mark_direction_ambiguous(mark1) and not self.has_distinct_paired_quotation_mark(mark1) ): yield (mark1, mark1) @@ -175,10 +175,10 @@ def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: continue # find matching closing marks - for mark2, matches2 in self.grouped_quotation_marks.items(): + for mark2, matches2 in self._grouped_quotation_marks.items(): if ( len(matches2) == 1 - and self.quote_convention_set.are_marks_a_valid_pair(mark1, mark2) + and self._quote_convention_set.marks_are_a_valid_pair(mark1, mark2) and matches1[0].precedes(matches2[0]) ): yield (mark1, mark2) @@ -186,8 +186,8 @@ def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: def has_distinct_paired_quotation_mark(self, quotation_mark: str) -> bool: return any( [ - mark != quotation_mark and mark in self.grouped_quotation_marks - for mark in self.quote_convention_set.get_possible_paired_quotation_marks(quotation_mark) + mark != quotation_mark and mark in self._grouped_quotation_marks + for mark in self._quote_convention_set.get_possible_paired_quotation_marks(quotation_mark) ] ) @@ -213,11 +213,11 @@ def process_quotation_marks( self._process_quotation_mark(quotation_mark_match) def _process_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> None: - if quotation_mark_match.does_quotation_mark_match(self.apostrophe_pattern): + if quotation_mark_match.quotation_mark_matches(self.apostrophe_pattern): self._count_apostrophe(quotation_mark_match) def _count_apostrophe(self, apostrophe_match: QuotationMarkStringMatch) -> None: - apostrophe: str = apostrophe_match.get_quotation_mark() + apostrophe: str = apostrophe_match.quotation_mark 
self._apostrophe_proportion_statistics.add_apostrophe() if self._is_match_word_initial(apostrophe_match): self._word_position_statistics.count_word_initial_apostrophe(apostrophe) @@ -285,7 +285,7 @@ def narrow_down_possible_quote_conventions(self, chapters: List[Chapter]) -> Quo return self._select_compatible_quote_conventions() def _analyze_quotation_marks_for_chapter(self, chapter: Chapter) -> None: - for verse in chapter.get_verses(): + for verse in chapter.verses: self._analyze_quotation_marks_for_verse(verse) def _analyze_quotation_marks_for_verse(self, verse: Verse) -> None: @@ -293,7 +293,7 @@ def _analyze_quotation_marks_for_verse(self, verse: Verse) -> None: self._quote_conventions ).find_all_potential_quotation_marks_in_verse(verse) self._analyze_quotation_mark_sequence(quotation_marks) - self._apostrophe_analyzer.process_quotation_marks(verse.get_text_segments(), quotation_marks) + self._apostrophe_analyzer.process_quotation_marks(verse.text_segments, quotation_marks) def _analyze_quotation_mark_sequence(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: quotation_mark_grouper: QuotationMarkGrouper = QuotationMarkGrouper(quotation_marks, self._quote_conventions) diff --git a/machine/corpora/analysis/quotation_mark_direction.py b/machine/corpora/analysis/quotation_mark_direction.py index f606f991..87734ae3 100644 --- a/machine/corpora/analysis/quotation_mark_direction.py +++ b/machine/corpora/analysis/quotation_mark_direction.py @@ -2,5 +2,5 @@ class QuotationMarkDirection(Enum): - Opening = auto() - Closing = auto() + OPENING = auto() + CLOSING = auto() diff --git a/machine/corpora/analysis/quotation_mark_finder.py b/machine/corpora/analysis/quotation_mark_finder.py index 73078abc..30a60cf1 100644 --- a/machine/corpora/analysis/quotation_mark_finder.py +++ b/machine/corpora/analysis/quotation_mark_finder.py @@ -13,16 +13,16 @@ class QuotationMarkFinder: quote_pattern = regex.compile(r"(\p{Quotation_Mark}|<<|>>|<|>)", regex.U) def 
__init__(self, quote_convention_set: QuoteConventionSet): - self.quote_convention_set = quote_convention_set + self._quote_convention_set = quote_convention_set def find_all_potential_quotation_marks_in_chapter(self, chapter: Chapter) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for verse in chapter.get_verses(): + for verse in chapter.verses: quotation_matches.extend(self.find_all_potential_quotation_marks_in_verse(verse)) return quotation_matches def find_all_potential_quotation_marks_in_verse(self, verse: Verse) -> List[QuotationMarkStringMatch]: - return self.find_all_potential_quotation_marks_in_text_segments(verse.get_text_segments()) + return self.find_all_potential_quotation_marks_in_text_segments(verse.text_segments) def find_all_potential_quotation_marks_in_text_segments( self, text_segments: List[TextSegment] @@ -36,9 +36,9 @@ def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment ) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for quote_match in self.quote_pattern.finditer(text_segment.get_text()): - if self.quote_convention_set.is_valid_opening_quotation_mark( + for quote_match in self.quote_pattern.finditer(text_segment.text): + if self._quote_convention_set.is_valid_opening_quotation_mark( quote_match.group() - ) or self.quote_convention_set.is_valid_closing_quotation_mark(quote_match.group()): + ) or self._quote_convention_set.is_valid_closing_quotation_mark(quote_match.group()): quotation_matches.append(QuotationMarkStringMatch(text_segment, quote_match.start(), quote_match.end())) return quotation_matches diff --git a/machine/corpora/analysis/quotation_mark_metadata.py b/machine/corpora/analysis/quotation_mark_metadata.py index efe3ce45..7737bd41 100644 --- a/machine/corpora/analysis/quotation_mark_metadata.py +++ b/machine/corpora/analysis/quotation_mark_metadata.py @@ -1,54 +1,19 @@ +from dataclasses import dataclass + 
from .quotation_mark_direction import QuotationMarkDirection from .quote_convention import QuoteConvention from .text_segment import TextSegment +@dataclass class QuotationMarkMetadata: - def __init__( - self, - quotation_mark: str, - depth: int, - direction: QuotationMarkDirection, - text_segment: TextSegment, - start_index: int, - end_index: int, - ): - self.quotation_mark = quotation_mark - self.depth = depth - self.direction = direction - self.text_segment = text_segment - self.start_index = start_index - self.end_index = end_index - - def __eq__(self, other): - if not isinstance(other, QuotationMarkMetadata): - return False - return ( - self.quotation_mark == other.quotation_mark - and self.depth == other.depth - and self.direction == other.direction - and self.text_segment == other.text_segment - and self.start_index == other.start_index - and self.end_index == other.end_index - ) - - def get_quotation_mark(self) -> str: - return self.quotation_mark - - def get_depth(self) -> int: - return self.depth - - def get_direction(self) -> QuotationMarkDirection: - return self.direction - - def get_text_segment(self) -> TextSegment: - return self.text_segment - - def get_start_index(self) -> int: - return self.start_index - def get_end_index(self) -> int: - return self.end_index + quotation_mark: str + depth: int + direction: QuotationMarkDirection + text_segment: TextSegment + start_index: int + end_index: int def update_quotation_mark(self, quote_convention: QuoteConvention) -> None: updated_quotation_mark = quote_convention.get_expected_quotation_mark(self.depth, self.direction) diff --git a/machine/corpora/analysis/quotation_mark_resolution_settings.py b/machine/corpora/analysis/quotation_mark_resolution_settings.py index cf50a2cd..aed6711e 100644 --- a/machine/corpora/analysis/quotation_mark_resolution_settings.py +++ b/machine/corpora/analysis/quotation_mark_resolution_settings.py @@ -1,4 +1,4 @@ -from abc import ABC +from abc import ABC, abstractmethod from 
typing import Set import regex @@ -9,20 +9,28 @@ class QuotationMarkResolutionSettings(ABC): + @abstractmethod def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: ... + @abstractmethod def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: ... + @abstractmethod def get_opening_quotation_mark_regex(self) -> regex.Pattern: ... + @abstractmethod def get_closing_quotation_mark_regex(self) -> regex.Pattern: ... + @abstractmethod def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: ... + @abstractmethod def should_rely_on_paragraph_markers(self) -> bool: ... + @abstractmethod def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: ... - def does_metadata_match_quotation_mark( + @abstractmethod + def metadata_matches_quotation_mark( self, quotation_mark: str, depth: int, direction: QuotationMarkDirection ) -> bool: ... diff --git a/machine/corpora/analysis/quotation_mark_resolver.py b/machine/corpora/analysis/quotation_mark_resolver.py index 658aa2e1..189c21dc 100644 --- a/machine/corpora/analysis/quotation_mark_resolver.py +++ b/machine/corpora/analysis/quotation_mark_resolver.py @@ -1,4 +1,4 @@ -from abc import ABC +from abc import ABC, abstractmethod from typing import Generator, List, Set from .quotation_mark_metadata import QuotationMarkMetadata @@ -10,12 +10,14 @@ class QuotationMarkResolver(ABC): def __init__(self, settings: QuotationMarkResolutionSettings): - self.settings = settings + self._settings = settings + @abstractmethod def resolve_quotation_marks( self, quote_matches: List[QuotationMarkStringMatch] ) -> Generator[QuotationMarkMetadata, None, None]: ... def reset(self) -> None: ... + @abstractmethod def get_issues(self) -> Set[QuotationMarkResolutionIssue]: ... 
diff --git a/machine/corpora/analysis/quotation_mark_string_match.py b/machine/corpora/analysis/quotation_mark_string_match.py index 6be5494c..736b376a 100644 --- a/machine/corpora/analysis/quotation_mark_string_match.py +++ b/machine/corpora/analysis/quotation_mark_string_match.py @@ -20,150 +20,131 @@ class QuotationMarkStringMatch: quote_introducer_pattern: Pattern = regex.compile(r"[:,]\s*$", regex.U) def __init__(self, text_segment: TextSegment, start_index: int, end_index: int): - self.text_segment = text_segment - self.start_index = start_index - self.end_index = end_index + self._text_segment = text_segment + self._start_index = start_index + self._end_index = end_index def __eq__(self, value): if not isinstance(value, QuotationMarkStringMatch): return False return ( - self.text_segment == value.text_segment - and self.start_index == value.start_index - and self.end_index == value.end_index + self._text_segment == value._text_segment + and self._start_index == value._start_index + and self._end_index == value._end_index ) - def get_quotation_mark(self) -> str: - return self.text_segment.get_text()[self.start_index : self.end_index] + @property + def quotation_mark(self) -> str: + return self._text_segment.text[self._start_index : self._end_index] def is_valid_opening_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: - return quote_convention_set.is_valid_opening_quotation_mark(self.get_quotation_mark()) + return quote_convention_set.is_valid_opening_quotation_mark(self.quotation_mark) def is_valid_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: - return quote_convention_set.is_valid_closing_quotation_mark(self.get_quotation_mark()) + return quote_convention_set.is_valid_closing_quotation_mark(self.quotation_mark) - def does_quotation_mark_match(self, regex_pattern: regex.Pattern) -> bool: - return regex_pattern.search(self.get_quotation_mark()) is not None + def quotation_mark_matches(self, 
regex_pattern: regex.Pattern) -> bool: + return regex_pattern.search(self.quotation_mark) is not None - def does_next_character_match(self, regex_pattern: regex.Pattern) -> bool: - return self.get_next_character() is not None and regex_pattern.search(self.get_next_character()) is not None + def next_character_matches(self, regex_pattern: regex.Pattern) -> bool: + return self.next_character is not None and regex_pattern.search(self.next_character) is not None - def does_previous_character_match(self, regex_pattern: regex.Pattern) -> bool: - return ( - self.get_previous_character() is not None - and regex_pattern.search(self.get_previous_character()) is not None - ) - - def get_previous_character(self) -> Union[str, None]: - if self.start_index == 0: - previous_segment = self.text_segment.get_previous_segment() - if previous_segment is not None and not self.text_segment.is_marker_in_preceding_context( - UsfmMarkerType.ParagraphMarker - ): - return previous_segment.get_text()[-1] - return None - return self.text_segment.get_text()[self.start_index - 1] + def previous_character_matches(self, regex_pattern: regex.Pattern) -> bool: + return self.previous_character is not None and regex_pattern.search(self.previous_character) is not None - def get_previous_character_string_match(self) -> Union["QuotationMarkStringMatch", None]: - if self.start_index == 0: - previous_segment = self.text_segment.get_previous_segment() - if previous_segment is not None and not self.text_segment.is_marker_in_preceding_context( - UsfmMarkerType.ParagraphMarker + @property + def previous_character(self) -> Union[str, None]: + if self._start_index == 0: + previous_segment = self._text_segment.previous_segment + if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( + UsfmMarkerType.PARAGRAPH ): - return QuotationMarkStringMatch( - previous_segment, previous_segment.length() - 1, previous_segment.length() - ) + return previous_segment.text[-1] return None - 
return QuotationMarkStringMatch(self.text_segment, self.start_index - 1, self.end_index - 1) + return self._text_segment.text[self._start_index - 1] - def get_next_character(self) -> Union[str, None]: + @property + def next_character(self) -> Union[str, None]: if self.is_at_end_of_segment(): - next_segment = self.text_segment.get_next_segment() - if next_segment is not None and not next_segment.is_marker_in_preceding_context( - UsfmMarkerType.ParagraphMarker - ): - return next_segment.get_text()[0] - return None - return self.text_segment.get_text()[self.end_index] - - def get_next_character_string_match(self) -> Union["QuotationMarkStringMatch", None]: - if self.is_at_end_of_segment(): - next_segment = self.text_segment.get_next_segment() - if next_segment is not None and not next_segment.is_marker_in_preceding_context( - UsfmMarkerType.ParagraphMarker - ): - return QuotationMarkStringMatch(next_segment, 0, 1) + next_segment = self._text_segment.next_segment + if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH): + return next_segment.text[0] return None - return QuotationMarkStringMatch(self.text_segment, self.start_index + 1, self.end_index + 1) + return self._text_segment.text[self._end_index] - def does_leading_substring_match(self, regex_pattern: regex.Pattern) -> bool: - return regex_pattern.search(self.text_segment.substring_before(self.start_index)) is not None + def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool: + return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None - def does_trailing_substring_match(self, regex_pattern: regex.Pattern) -> bool: - return regex_pattern.search(self.text_segment.substring_after(self.end_index)) is not None + def trailing_substring_matches(self, regex_pattern: regex.Pattern) -> bool: + return regex_pattern.search(self._text_segment.substring_after(self._end_index)) is not None # this assumes that the two 
matches occur in the same verse def precedes(self, other: "QuotationMarkStringMatch") -> bool: - return self.text_segment.index_in_verse < other.text_segment.index_in_verse or ( - self.text_segment.index_in_verse == other.text_segment.index_in_verse - and self.start_index < other.start_index + return self._text_segment._index_in_verse < other._text_segment._index_in_verse or ( + self._text_segment._index_in_verse == other._text_segment._index_in_verse + and self._start_index < other._start_index ) - def get_text_segment(self) -> TextSegment: - return self.text_segment + @property + def text_segment(self) -> TextSegment: + return self._text_segment - def get_start_index(self) -> int: - return self.start_index + @property + def start_index(self) -> int: + return self._start_index - def get_end_index(self) -> int: - return self.end_index + @property + def end_index(self) -> int: + return self._end_index - def get_context(self) -> str: - return self.text_segment.get_text()[ - max(self.start_index - 10, 0) : min(self.end_index + 10, len(self.text_segment.get_text())) + # not used, but a useful method for debugging + @property + def context(self) -> str: + return self._text_segment.text[ + max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) ] def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: return QuotationMarkMetadata( - self.get_quotation_mark(), depth, direction, self.text_segment, self.start_index, self.end_index + self.quotation_mark, depth, direction, self._text_segment, self._start_index, self._end_index ) def is_at_start_of_segment(self) -> bool: - return self.start_index == 0 + return self._start_index == 0 def is_at_end_of_segment(self) -> bool: - return self.end_index == self.text_segment.length() + return self._end_index == self._text_segment.length def has_leading_whitespace(self) -> bool: - if self.get_previous_character() is None: + if self.previous_character is None: return ( - 
self.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.ParagraphMarker) - or self.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.EmbedMarker) - or self.get_text_segment().is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) + self._text_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH) + or self._text_segment.marker_is_in_preceding_context(UsfmMarkerType.EMBED) + or self._text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) ) - return self.does_previous_character_match(self.whitespace_pattern) + return self.previous_character_matches(self.whitespace_pattern) def has_trailing_whitespace(self) -> bool: - return self.does_next_character_match(self.whitespace_pattern) + return self.next_character_matches(self.whitespace_pattern) def has_leading_punctuation(self) -> bool: - return self.does_previous_character_match(self.punctuation_pattern) + return self.previous_character_matches(self.punctuation_pattern) def has_trailing_punctuation(self) -> bool: - return self.does_next_character_match(self.punctuation_pattern) + return self.next_character_matches(self.punctuation_pattern) def has_letter_in_leading_substring(self) -> bool: - return self.does_leading_substring_match(self.letter_pattern) + return self.leading_substring_matches(self.letter_pattern) def has_letter_in_trailing_substring(self) -> bool: - return self.does_trailing_substring_match(self.letter_pattern) + return self.trailing_substring_matches(self.letter_pattern) def has_leading_latin_letter(self) -> bool: - return self.does_previous_character_match(self.latin_letter_pattern) + return self.previous_character_matches(self.latin_letter_pattern) def has_trailing_latin_letter(self) -> bool: - return self.does_next_character_match(self.latin_letter_pattern) + return self.next_character_matches(self.latin_letter_pattern) def has_quote_introducer_in_leading_substring(self) -> bool: - return 
self.does_leading_substring_match(self.quote_introducer_pattern) + return self.leading_substring_matches(self.quote_introducer_pattern) diff --git a/machine/corpora/analysis/quotation_mark_tabulator.py b/machine/corpora/analysis/quotation_mark_tabulator.py index eb3eb6c3..522d145e 100644 --- a/machine/corpora/analysis/quotation_mark_tabulator.py +++ b/machine/corpora/analysis/quotation_mark_tabulator.py @@ -7,26 +7,26 @@ class QuotationMarkCounts: def __init__(self): - self.string_counts: Dict[str, int] = dict() - self.total_count = 0 + self._string_counts: Dict[str, int] = dict() + self._total_count = 0 def count_quotation_mark(self, quotation_mark: str) -> None: - if quotation_mark not in self.string_counts: - self.string_counts[quotation_mark] = 0 - self.string_counts[quotation_mark] += 1 - self.total_count += 1 + if quotation_mark not in self._string_counts: + self._string_counts[quotation_mark] = 0 + self._string_counts[quotation_mark] += 1 + self._total_count += 1 - def get_best_proportion(self) -> tuple[str, int, int]: - best_str = max(self.string_counts, key=lambda x: self.string_counts[x]) - return (best_str, self.string_counts[best_str], self.total_count) + def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]: + best_str = max(self._string_counts, key=lambda x: self._string_counts[x]) + return (best_str, self._string_counts[best_str], self._total_count) def calculate_num_differences(self, expected_quotation_mark: str) -> int: - if expected_quotation_mark not in self.string_counts: - return self.total_count - return self.total_count - self.string_counts[expected_quotation_mark] + if expected_quotation_mark not in self._string_counts: + return self._total_count + return self._total_count - self._string_counts[expected_quotation_mark] def get_observed_count(self) -> int: - return self.total_count + return self._total_count class QuotationMarkTabulator: @@ -41,19 +41,19 @@ def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> 
None: self._count_quotation_mark(quotation_mark) def _count_quotation_mark(self, quote: QuotationMarkMetadata) -> None: - key = (quote.get_depth(), quote.get_direction()) - quotation_mark = quote.get_quotation_mark() + key = (quote.depth, quote.direction) + quotation_mark = quote.quotation_mark if key not in self.quotation_counts_by_depth_and_direction: self.quotation_counts_by_depth_and_direction[key] = QuotationMarkCounts() self.quotation_counts_by_depth_and_direction[key].count_quotation_mark(quotation_mark) - def _has_depth_and_direction_been_observed(self, depth: int, direction: QuotationMarkDirection) -> bool: + def _depth_and_direction_observed(self, depth: int, direction: QuotationMarkDirection) -> bool: return (depth, direction) in self.quotation_counts_by_depth_and_direction - def _get_most_common_quote_by_depth_and_direction( + def _find_most_common_quotation_mark_with_depth_and_direction( self, depth: int, direction: QuotationMarkDirection ) -> tuple[str, int, int]: - return self.quotation_counts_by_depth_and_direction[(depth, direction)].get_best_proportion() + return self.quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion() def calculate_similarity(self, quote_convention: QuoteConvention) -> float: num_differences = 0 @@ -75,14 +75,18 @@ def calculate_similarity(self, quote_convention: QuoteConvention) -> float: def print_summary(self) -> None: for depth in range(1, 5): - if self._has_depth_and_direction_been_observed( - depth, QuotationMarkDirection.Opening - ) and self._has_depth_and_direction_been_observed(depth, QuotationMarkDirection.Closing): + if self._depth_and_direction_observed( + depth, QuotationMarkDirection.OPENING + ) and self._depth_and_direction_observed(depth, QuotationMarkDirection.CLOSING): (opening_quotation_mark, observed_opening_count, total_opening_count) = ( - self._get_most_common_quote_by_depth_and_direction(depth, QuotationMarkDirection.Opening) + 
self._find_most_common_quotation_mark_with_depth_and_direction( + depth, QuotationMarkDirection.OPENING + ) ) (closing_quotation_mark, observed_closing_count, total_closing_count) = ( - self._get_most_common_quote_by_depth_and_direction(depth, QuotationMarkDirection.Closing) + self._find_most_common_quotation_mark_with_depth_and_direction( + depth, QuotationMarkDirection.CLOSING + ) ) print( "The most common level %i quotes are %s (%i of %i opening quotes) and %s (%i of %i closing quotes)" diff --git a/machine/corpora/analysis/quote_convention.py b/machine/corpora/analysis/quote_convention.py index 944b8ee4..609c786b 100644 --- a/machine/corpora/analysis/quote_convention.py +++ b/machine/corpora/analysis/quote_convention.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from typing import Dict, Set from .quotation_mark_direction import QuotationMarkDirection @@ -18,16 +19,10 @@ } +@dataclass class SingleLevelQuoteConvention: - def __init__(self, opening_quote: str, closing_quote: str): - self.opening_quote = opening_quote - self.closing_quote = closing_quote - - def get_opening_quote(self) -> str: - return self.opening_quote - - def get_closing_quote(self) -> str: - return self.closing_quote + opening_quote: str + closing_quote: str def normalize(self) -> "SingleLevelQuoteConvention": normalized_opening_quote = ( @@ -56,9 +51,9 @@ def __eq__(self, value): if len(self.levels) != len(value.levels): return False for level, other_level in zip(self.levels, value.levels): - if level.get_opening_quote() != other_level.get_opening_quote(): + if level.opening_quote != other_level.opening_quote: return False - if level.get_closing_quote() != other_level.get_closing_quote(): + if level.closing_quote != other_level.closing_quote: return False return True @@ -69,38 +64,38 @@ def get_num_levels(self) -> int: return len(self.levels) def get_opening_quote_at_level(self, level: int) -> str: - return self.levels[level - 1].get_opening_quote() + return self.levels[level - 
1].opening_quote def get_closing_quote_at_level(self, level: int) -> str: - return self.levels[level - 1].get_closing_quote() + return self.levels[level - 1].closing_quote def get_expected_quotation_mark(self, depth: int, direction: QuotationMarkDirection) -> str: if depth > len(self.levels) or depth < 1: return "" return ( self.get_opening_quote_at_level(depth) - if direction is QuotationMarkDirection.Opening + if direction is QuotationMarkDirection.OPENING else self.get_closing_quote_at_level(depth) ) def _includes_opening_quotation_mark(self, opening_quotation_mark: str) -> bool: for level in self.levels: - if level.get_opening_quote() == opening_quotation_mark: + if level.opening_quote == opening_quotation_mark: return True return False def _includes_closing_quotation_mark(self, closing_quotation_mark: str) -> bool: for level in self.levels: - if level.get_closing_quote() == closing_quotation_mark: + if level.closing_quote == closing_quotation_mark: return True return False def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: depths: Set[int] = set() for depth, level in enumerate(self.levels, start=1): - if direction is QuotationMarkDirection.Opening and level.get_opening_quote() == quotation_mark: + if direction is QuotationMarkDirection.OPENING and level.opening_quote == quotation_mark: depths.add(depth) - elif direction is QuotationMarkDirection.Closing and level.get_closing_quote() == quotation_mark: + elif direction is QuotationMarkDirection.CLOSING and level.closing_quote == quotation_mark: depths.add(depth) return depths @@ -132,9 +127,9 @@ def _get_summary_message(self) -> str: for level, convention in enumerate(self.levels): ordinal_name = self._get_ordinal_name(level + 1) summary += "%s%s-level quote%s\n" % ( - convention.get_opening_quote(), + convention.opening_quote, ordinal_name, - convention.get_closing_quote(), + convention.closing_quote, ) return summary diff --git 
a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py b/machine/corpora/analysis/quote_convention_detection_resolution_settings.py index 7192bd3d..c167fbb9 100644 --- a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py +++ b/machine/corpora/analysis/quote_convention_detection_resolution_settings.py @@ -20,13 +20,13 @@ def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStr return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_set) def get_opening_quotation_mark_regex(self) -> regex.Pattern: - return self._quote_convention_set.get_opening_quotation_mark_regex() + return self._quote_convention_set.opening_quotation_mark_regex def get_closing_quotation_mark_regex(self) -> regex.Pattern: - return self._quote_convention_set.get_closing_quotation_mark_regex() + return self._quote_convention_set.closing_quotation_mark_regex def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: - return self._quote_convention_set.are_marks_a_valid_pair(opening_mark, closing_mark) + return self._quote_convention_set.marks_are_a_valid_pair(opening_mark, closing_mark) def should_rely_on_paragraph_markers(self): return True @@ -34,7 +34,7 @@ def should_rely_on_paragraph_markers(self): def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: return self._quote_convention_set.get_possible_depths(quotation_mark, direction) - def does_metadata_match_quotation_mark( + def metadata_matches_quotation_mark( self, quotation_mark: str, depth: int, direction: QuotationMarkDirection ) -> bool: - return self._quote_convention_set.does_metadata_match_quotation_mark(quotation_mark, depth, direction) + return self._quote_convention_set.metadata_matches_quotation_mark(quotation_mark, depth, direction) diff --git a/machine/corpora/analysis/quote_convention_detector.py b/machine/corpora/analysis/quote_convention_detector.py index 
b7b186a9..3e2673b9 100644 --- a/machine/corpora/analysis/quote_convention_detector.py +++ b/machine/corpora/analysis/quote_convention_detector.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from typing import List, Union from .chapter import Chapter @@ -14,23 +15,17 @@ from .usfm_structure_extractor import UsfmStructureExtractor +@dataclass class QuoteConventionAnalysis: - def __init__(self, best_quote_convention: QuoteConvention, best_quote_convention_score: float): - self.best_quote_convention = best_quote_convention - self.best_quote_convention_score = best_quote_convention_score - - def get_best_quote_convention(self) -> QuoteConvention: - return self.best_quote_convention - - def get_best_quote_convention_similarity_score(self) -> float: - return self.best_quote_convention_score * 100 + best_quote_convention: QuoteConvention + best_quote_convention_score: float class QuoteConventionDetector(UsfmStructureExtractor): def __init__(self): super().__init__() - self.quotation_mark_tabulator = QuotationMarkTabulator() + self._quotation_mark_tabulator = QuotationMarkTabulator() def _count_quotation_marks_in_chapters(self, chapters: list[Chapter]) -> None: possible_quote_conventions: QuoteConventionSet = PreliminaryQuotationAnalyzer( @@ -53,17 +48,17 @@ def _count_quotation_marks_in_chapter( ).resolve_quotation_marks(quotation_mark_matches) ) - self.quotation_mark_tabulator.tabulate(resolved_quotation_marks) + self._quotation_mark_tabulator.tabulate(resolved_quotation_marks) def detect_quotation_convention(self, print_summary: bool) -> Union[QuoteConventionAnalysis, None]: self._count_quotation_marks_in_chapters(self.get_chapters()) (best_quote_convention, score) = standard_quote_conventions.find_most_similar_convention( - self.quotation_mark_tabulator + self._quotation_mark_tabulator ) if print_summary: - self.quotation_mark_tabulator.print_summary() + self._quotation_mark_tabulator.print_summary() if score > 0 and best_quote_convention is not None: return 
QuoteConventionAnalysis(best_quote_convention, score) diff --git a/machine/corpora/analysis/quote_convention_set.py b/machine/corpora/analysis/quote_convention_set.py index d825c45c..5d27c25f 100644 --- a/machine/corpora/analysis/quote_convention_set.py +++ b/machine/corpora/analysis/quote_convention_set.py @@ -10,22 +10,22 @@ class QuoteConventionSet: def __init__(self, conventions: List[QuoteConvention]): - self.conventions = conventions + self._conventions = conventions self._create_quote_regexes() self._create_quotation_mark_pair_map() def __eq__(self, other: object) -> bool: if not isinstance(other, QuoteConventionSet): return False - return self.conventions == other.conventions + return self._conventions == other._conventions def _create_quote_regexes(self) -> None: opening_quotation_marks: Set[str] = set() closing_quotation_marks: Set[str] = set() all_quotation_marks: Set[str] = set() - if len(self.conventions) > 0: - for convention in self.conventions: + if len(self._conventions) > 0: + for convention in self._conventions: for level in range(1, convention.get_num_levels() + 1): opening_quote = convention.get_opening_quote_at_level(level) closing_quote = convention.get_closing_quote_at_level(level) @@ -35,27 +35,27 @@ def _create_quote_regexes(self) -> None: all_quotation_marks.add(closing_quote) if len(all_quotation_marks) > 0: - self.opening_quotation_mark_regex: Pattern = regex.compile( + self._opening_quotation_mark_regex: Pattern = regex.compile( r"[" + "".join(sorted(list(opening_quotation_marks))) + "]" ) - self.closing_quotation_mark_regex: Pattern = regex.compile( + self._closing_quotation_mark_regex: Pattern = regex.compile( r"[" + "".join(sorted(list(closing_quotation_marks))) + "]" ) - self.all_quotation_mark_regex: Pattern = regex.compile( + self._all_quotation_mark_regex: Pattern = regex.compile( r"[" + "".join(sorted(list(all_quotation_marks))) + "]" ) if len(opening_quotation_marks) == 0: - self.opening_quotation_mark_regex = 
regex.compile(r"") + self._opening_quotation_mark_regex = regex.compile(r"") if len(closing_quotation_marks) == 0: - self.closing_quotation_mark_regex = regex.compile(r"") + self._closing_quotation_mark_regex = regex.compile(r"") if len(all_quotation_marks) == 0: - self.all_quotation_mark_regex = regex.compile(r"") + self._all_quotation_mark_regex = regex.compile(r"") def _create_quotation_mark_pair_map(self) -> None: self.closing_marks_by_opening_mark: Dict[str, set[str]] = dict() self.opening_marks_by_closing_mark: Dict[str, set[str]] = dict() - for convention in self.conventions: + for convention in self._conventions: for level in range(1, convention.get_num_levels() + 1): opening_quote = convention.get_opening_quote_at_level(level) closing_quote = convention.get_closing_quote_at_level(level) @@ -66,14 +66,26 @@ def _create_quotation_mark_pair_map(self) -> None: self.opening_marks_by_closing_mark[closing_quote] = set() self.opening_marks_by_closing_mark[closing_quote].add(opening_quote) + @property + def opening_quotation_mark_regex(self) -> Pattern: + return self._opening_quotation_mark_regex + + @property + def closing_quotation_mark_regex(self) -> Pattern: + return self._closing_quotation_mark_regex + + @property + def quotation_mark_regex(self) -> Pattern: + return self._all_quotation_mark_regex + def get_quote_convention_by_name(self, name: str) -> Union[QuoteConvention, None]: - for convention in self.conventions: + for convention in self._conventions: if convention.get_name() == name: return convention return None def get_all_quote_convention_names(self) -> List[str]: - return sorted([qc.name for qc in self.conventions]) + return sorted([qc.name for qc in self._conventions]) def get_possible_opening_marks(self) -> list[str]: return sorted(list(self.closing_marks_by_opening_mark.keys())) @@ -87,7 +99,7 @@ def is_valid_opening_quotation_mark(self, quotation_mark: str) -> bool: def is_valid_closing_quotation_mark(self, quotation_mark: str) -> bool: return 
quotation_mark in self.opening_marks_by_closing_mark - def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: + def marks_are_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: return (opening_mark in self.closing_marks_by_opening_mark) and ( closing_mark in self.closing_marks_by_opening_mark[opening_mark] ) @@ -108,23 +120,14 @@ def get_possible_paired_quotation_marks(self, quotation_mark: str) -> Set[str]: def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: depths: Set[int] = set() - for convention in self.conventions: + for convention in self._conventions: depths.update(convention.get_possible_depths(quotation_mark, direction)) return depths - def get_opening_quotation_mark_regex(self) -> Pattern: - return self.opening_quotation_mark_regex - - def get_closing_quotation_mark_regex(self) -> Pattern: - return self.closing_quotation_mark_regex - - def get_quotation_mark_regex(self) -> Pattern: - return self.all_quotation_mark_regex - - def does_metadata_match_quotation_mark( + def metadata_matches_quotation_mark( self, quotation_mark: str, depth: int, direction: QuotationMarkDirection ) -> bool: - for convention in self.conventions: + for convention in self._conventions: if convention.get_expected_quotation_mark(depth, direction) == quotation_mark: return True return False @@ -135,7 +138,7 @@ def filter_to_compatible_quote_conventions( return QuoteConventionSet( [ convention - for convention in self.conventions + for convention in self._conventions if convention.is_compatible_with_observed_quotation_marks( opening_quotation_marks, closing_quotation_marks ) @@ -147,7 +150,7 @@ def find_most_similar_convention( ) -> Tuple[Union[QuoteConvention, None], float]: best_similarity: float = float("-inf") best_quote_convention: Union[QuoteConvention, None] = None - for quote_convention in self.conventions: + for quote_convention in self._conventions: similarity = 
tabulated_quotation_marks.calculate_similarity(quote_convention) if similarity > best_similarity: best_similarity = similarity diff --git a/machine/corpora/analysis/text_segment.py b/machine/corpora/analysis/text_segment.py index 8c7b7159..039bd579 100644 --- a/machine/corpora/analysis/text_segment.py +++ b/machine/corpora/analysis/text_segment.py @@ -6,96 +6,99 @@ class TextSegment: def __init__(self): - self.text = "" - self.immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NoMarker - self.markers_in_preceding_context: Set[UsfmMarkerType] = set() - self.previous_segment: Union[TextSegment, None] = None - self.next_segment: Union[TextSegment, None] = None - self.index_in_verse: int = 0 - self.num_segments_in_verse: int = 0 - self.usfm_token: Union[UsfmToken, None] = None + self._text = "" + self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER + self._markers_in_preceding_context: Set[UsfmMarkerType] = set() + self._previous_segment: Union[TextSegment, None] = None + self._next_segment: Union[TextSegment, None] = None + self._index_in_verse: int = 0 + self._num_segments_in_verse: int = 0 + self._usfm_token: Union[UsfmToken, None] = None def __eq__(self, value): if not isinstance(value, TextSegment): return False - if self.text != value.text: + if self._text != value._text: return False - if self.index_in_verse != value.index_in_verse: + if self._index_in_verse != value._index_in_verse: return False - if self.usfm_token != value.usfm_token: + if self._index_in_verse != value._index_in_verse: return False - if self.immediate_preceding_marker != value.immediate_preceding_marker: + if self._usfm_token != value._usfm_token: + return False + if self._immediate_preceding_marker != value._immediate_preceding_marker: return False return True - def get_text(self) -> str: - return self.text + @property + def text(self) -> str: + return self._text + + @property + def previous_segment(self) -> Union["TextSegment", None]: + return 
self._previous_segment + + @property + def next_segment(self) -> Union["TextSegment", None]: + return self._next_segment + @property def length(self) -> int: - return len(self.text) + return len(self._text) def substring_before(self, index: int) -> str: - return self.text[:index] + return self._text[:index] def substring_after(self, index: int) -> str: - return self.text[index:] - - def get_immediate_preceding_marker_type(self) -> UsfmMarkerType: - return self.immediate_preceding_marker - - def is_marker_in_preceding_context(self, marker: UsfmMarkerType) -> bool: - return marker in self.markers_in_preceding_context - - def get_previous_segment(self) -> Union["TextSegment", None]: - return self.previous_segment + return self._text[index:] - def get_next_segment(self) -> Union["TextSegment", None]: - return self.next_segment + def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool: + return marker in self._markers_in_preceding_context def is_first_segment_in_verse(self) -> bool: - return self.index_in_verse == 0 + return self._index_in_verse == 0 def is_last_segment_in_verse(self) -> bool: - return self.index_in_verse == self.num_segments_in_verse - 1 + return self._index_in_verse == self._num_segments_in_verse - 1 def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: - self.text = self.text[:start_index] + replacement + self.text[end_index:] - if self.usfm_token is not None: - self.usfm_token.text = self.text + self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index) + if self._usfm_token is not None: + self._usfm_token.text = self._text # These setters need to be implemented outside the builder to avoid circular dependencies def set_previous_segment(self, previous_segment: "TextSegment") -> None: - self.previous_segment = previous_segment + self._previous_segment = previous_segment def set_next_segment(self, next_segment: "TextSegment") -> None: - self.next_segment = 
next_segment + self._next_segment = next_segment def set_index_in_verse(self, index_in_verse: int) -> None: - self.index_in_verse = index_in_verse + self._index_in_verse = index_in_verse def set_num_segments_in_verse(self, num_segments_in_verse: int) -> None: - self.num_segments_in_verse = num_segments_in_verse + self._num_segments_in_verse = num_segments_in_verse class Builder: def __init__(self): - self.text_segment = TextSegment() + self._text_segment = TextSegment() def set_previous_segment(self, previous_segment: "TextSegment") -> "TextSegment.Builder": - self.text_segment.previous_segment = previous_segment + self._text_segment._previous_segment = previous_segment return self def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder": - self.text_segment.immediate_preceding_marker = marker - self.text_segment.markers_in_preceding_context.add(marker) + self._text_segment._immediate_preceding_marker = marker + self._text_segment._markers_in_preceding_context.add(marker) return self def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": - self.text_segment.usfm_token = token + self._text_segment._usfm_token = token return self def set_text(self, text: str) -> "TextSegment.Builder": - self.text_segment.text = text + self._text_segment._text = text return self def build(self) -> "TextSegment": - return self.text_segment + return self._text_segment diff --git a/machine/corpora/analysis/usfm_marker_type.py b/machine/corpora/analysis/usfm_marker_type.py index 00bbbb1a..ea4349e6 100644 --- a/machine/corpora/analysis/usfm_marker_type.py +++ b/machine/corpora/analysis/usfm_marker_type.py @@ -2,10 +2,10 @@ class UsfmMarkerType(Enum): - ParagraphMarker = auto() - CharacterMarker = auto() - VerseMarker = auto() - ChapterMarker = auto() - EmbedMarker = auto() - Other = auto() - NoMarker = auto() + PARAGRAPH = auto() + CHARACTER = auto() + VERSE = auto() + CHAPTER = auto() + EMBED = auto() + OTHER = auto() + NO_MARKER = auto() diff --git 
a/machine/corpora/analysis/usfm_structure_extractor.py b/machine/corpora/analysis/usfm_structure_extractor.py index 71968780..8958c2aa 100644 --- a/machine/corpora/analysis/usfm_structure_extractor.py +++ b/machine/corpora/analysis/usfm_structure_extractor.py @@ -14,8 +14,8 @@ def __init__(self): self._reset() def _reset(self): - self.text_segments: list[TextSegment] = [] - self.next_text_segment_builder: TextSegment.Builder = TextSegment.Builder() + self._text_segments: list[TextSegment] = [] + self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder() def chapter( self, @@ -25,7 +25,7 @@ def chapter( alt_number: Optional[str], pub_number: Optional[str], ) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER) def start_para( self, @@ -34,7 +34,7 @@ def start_para( unknown: bool, attributes: Optional[Sequence[UsfmAttribute]], ) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker) + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.PARAGRAPH) def start_char( self, @@ -43,57 +43,55 @@ def start_char( unknown: bool, attributes: Optional[Sequence[UsfmAttribute]], ) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHARACTER) def end_char( self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool ) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHARACTER) def verse( self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str] ) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) + 
self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.VERSE) def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) def end_table(self, state: UsfmParserState) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None: - self.next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) def text(self, state: UsfmParserState, text: str) -> None: if not state.is_verse_text: return if len(text) > 0: - self.next_text_segment_builder.set_text(text) - text_segment: TextSegment = self.next_text_segment_builder.build() + self._next_text_segment_builder.set_text(text) + text_segment: TextSegment = self._next_text_segment_builder.build() # don't look past verse boundaries, to enable identical functionality in the # online one-verse-at-a-time (QuotationDenormalizationScriptureUpdateBlockHandler) # and offline whole-book-at-once settings (QuoteConventionDetector) - if len(self.text_segments) > 0 and not text_segment.is_marker_in_preceding_context( - UsfmMarkerType.VerseMarker - ): - self.text_segments[-1].set_next_segment(text_segment) - text_segment.set_previous_segment(self.text_segments[-1]) - self.text_segments.append(text_segment) - self.next_text_segment_builder = TextSegment.Builder() + if len(self._text_segments) > 0 and not 
text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): + self._text_segments[-1].set_next_segment(text_segment) + text_segment.set_previous_segment(self._text_segments[-1]) + self._text_segments.append(text_segment) + self._next_text_segment_builder = TextSegment.Builder() def get_chapters(self) -> list[Chapter]: chapters: list[Chapter] = [] current_chapter_verses: list[Verse] = [] current_verse_segments: list[TextSegment] = [] - for text_segment in self.text_segments: - if text_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker): + for text_segment in self._text_segments: + if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): if len(current_verse_segments) > 0: current_chapter_verses.append(Verse(current_verse_segments)) current_verse_segments = [] - if text_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker): + if text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER): if len(current_chapter_verses) > 0: chapters.append(Chapter(current_chapter_verses)) current_chapter_verses = [] diff --git a/machine/corpora/analysis/verse.py b/machine/corpora/analysis/verse.py index 98fc58ca..00916586 100644 --- a/machine/corpora/analysis/verse.py +++ b/machine/corpora/analysis/verse.py @@ -3,13 +3,14 @@ class Verse: def __init__(self, text_segments: list[TextSegment]): - self.text_segments = text_segments + self._text_segments = text_segments self._index_text_segments() def _index_text_segments(self) -> None: - for index, text_segment in enumerate(self.text_segments): + for index, text_segment in enumerate(self._text_segments): text_segment.set_index_in_verse(index) - text_segment.set_num_segments_in_verse(len(self.text_segments)) + text_segment.set_num_segments_in_verse(len(self._text_segments)) - def get_text_segments(self) -> list[TextSegment]: - return self.text_segments + @property + def text_segments(self) -> list[TextSegment]: + return self._text_segments diff --git 
a/machine/corpora/fallback_quotation_mark_resolver.py b/machine/corpora/fallback_quotation_mark_resolver.py index 106ef460..002fd4ea 100644 --- a/machine/corpora/fallback_quotation_mark_resolver.py +++ b/machine/corpora/fallback_quotation_mark_resolver.py @@ -69,13 +69,13 @@ def _does_most_recent_opening_mark_immediately_precede( ) -> bool: if ( self._last_quotation_mark is None - or self._last_quotation_mark.get_direction() is not QuotationMarkDirection.Opening + or self._last_quotation_mark.direction is not QuotationMarkDirection.OPENING ): return False return ( - self._last_quotation_mark.get_text_segment() == match.get_text_segment() - and self._last_quotation_mark.get_end_index() == match.get_start_index() + self._last_quotation_mark.text_segment == match.text_segment + and self._last_quotation_mark.end_index == match.start_index ) def _is_closing_quote( @@ -96,23 +96,23 @@ def _is_closing_quote( def _resolve_opening_mark(self, quote_match: QuotationMarkStringMatch) -> Union[QuotationMarkMetadata, None]: possible_depths: Set[int] = self._settings.get_possible_depths( - quote_match.get_quotation_mark(), QuotationMarkDirection.Opening + quote_match.quotation_mark, QuotationMarkDirection.OPENING ) if len(possible_depths) == 0: return None - quote = quote_match.resolve(min(possible_depths), QuotationMarkDirection.Opening) + quote = quote_match.resolve(min(possible_depths), QuotationMarkDirection.OPENING) self._last_quotation_mark = quote return quote def _resolve_closing_mark(self, quote_match: QuotationMarkStringMatch) -> Union[QuotationMarkMetadata, None]: possible_depths: Set[int] = self._settings.get_possible_depths( - quote_match.get_quotation_mark(), QuotationMarkDirection.Closing + quote_match.quotation_mark, QuotationMarkDirection.CLOSING ) if len(possible_depths) == 0: return None - quote = quote_match.resolve(min(possible_depths), QuotationMarkDirection.Closing) + quote = quote_match.resolve(min(possible_depths), QuotationMarkDirection.CLOSING) 
self._last_quotation_mark = quote return quote diff --git a/machine/corpora/quotation_denormalization_first_pass.py b/machine/corpora/quotation_denormalization_first_pass.py index 19ecdb9d..d3cc453c 100644 --- a/machine/corpora/quotation_denormalization_first_pass.py +++ b/machine/corpora/quotation_denormalization_first_pass.py @@ -2,6 +2,7 @@ from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass +# This is a convenience class so that users don't have to know to normalize the source quote convention class QuotationDenormalizationFirstPass(QuotationMarkUpdateFirstPass): def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): diff --git a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py index 6b412202..d6cd9cef 100644 --- a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py +++ b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py @@ -3,6 +3,7 @@ from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler +# This is a convenience class so that users don't have to know to normalize the source quote convention class QuotationDenormalizationUsfmUpdateBlockHandler(QuoteConventionChangingUsfmUpdateBlockHandler): def __init__( diff --git a/machine/corpora/quotation_mark_update_first_pass.py b/machine/corpora/quotation_mark_update_first_pass.py index e4b294e3..f4853968 100644 --- a/machine/corpora/quotation_mark_update_first_pass.py +++ b/machine/corpora/quotation_mark_update_first_pass.py @@ -13,6 +13,7 @@ from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy +# Determines the best strategy to take for each chapter class QuotationMarkUpdateFirstPass(UsfmStructureExtractor): def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): @@ -47,15 +48,15 @@ def 
_check_whether_fallback_mode_will_work( return False return True - def get_best_actions_by_chapter(self) -> List[QuotationMarkUpdateStrategy]: + def find_best_chapter_strategies(self) -> List[QuotationMarkUpdateStrategy]: best_actions_by_chapter: List[QuotationMarkUpdateStrategy] = [] for chapter in self.get_chapters(): - best_actions_by_chapter.append(self._find_best_action_for_chapter(chapter)) + best_actions_by_chapter.append(self._find_best_strategy_for_chapter(chapter)) return best_actions_by_chapter - def _find_best_action_for_chapter(self, chapter: Chapter) -> QuotationMarkUpdateStrategy: + def _find_best_strategy_for_chapter(self, chapter: Chapter) -> QuotationMarkUpdateStrategy: quotation_mark_matches: List[QuotationMarkStringMatch] = ( self._quotation_mark_finder.find_all_potential_quotation_marks_in_chapter(chapter) ) @@ -65,9 +66,9 @@ def _find_best_action_for_chapter(self, chapter: Chapter) -> QuotationMarkUpdate # use list() to force evaluation of the generator list(self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches)) - return self._choose_best_action_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) + return self._choose_best_strategy_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) - def _choose_best_action_based_on_observed_issues(self, issues) -> QuotationMarkUpdateStrategy: + def _choose_best_strategy_based_on_observed_issues(self, issues) -> QuotationMarkUpdateStrategy: if QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues: return QuotationMarkUpdateStrategy.SKIP diff --git a/machine/corpora/quotation_mark_update_resolution_settings.py b/machine/corpora/quotation_mark_update_resolution_settings.py index f67dce18..b0fd40f2 100644 --- a/machine/corpora/quotation_mark_update_resolution_settings.py +++ b/machine/corpora/quotation_mark_update_resolution_settings.py @@ -22,13 +22,13 @@ def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStr return 
quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_singleton_set) def get_opening_quotation_mark_regex(self) -> regex.Pattern: - return self._quote_convention_singleton_set.get_opening_quotation_mark_regex() + return self._quote_convention_singleton_set.opening_quotation_mark_regex def get_closing_quotation_mark_regex(self) -> regex.Pattern: - return self._quote_convention_singleton_set.get_closing_quotation_mark_regex() + return self._quote_convention_singleton_set.closing_quotation_mark_regex def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: - return self._quote_convention_singleton_set.are_marks_a_valid_pair(opening_mark, closing_mark) + return self._quote_convention_singleton_set.marks_are_a_valid_pair(opening_mark, closing_mark) def should_rely_on_paragraph_markers(self): return False @@ -36,7 +36,7 @@ def should_rely_on_paragraph_markers(self): def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: return self._source_quote_convention.get_possible_depths(quotation_mark, direction) - def does_metadata_match_quotation_mark( + def metadata_matches_quotation_mark( self, quotation_mark: str, depth: int, direction: QuotationMarkDirection ) -> bool: return self._source_quote_convention.get_expected_quotation_mark(depth, direction) == quotation_mark diff --git a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py index 153fcafe..745b718b 100644 --- a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py +++ b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py @@ -94,13 +94,13 @@ def _create_text_segments(self, element: UsfmUpdateBlockElement) -> List[TextSeg text_segments: List[TextSegment] = [] for token in element.get_tokens(): if token.type == UsfmTokenType.VERSE: - 
self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VERSE) elif token.type == UsfmTokenType.PARAGRAPH: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker) + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.PARAGRAPH) elif token.type == UsfmTokenType.CHARACTER: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker) + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHARACTER) elif token.type == UsfmTokenType.NOTE: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker) + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) elif token.type == UsfmTokenType.TEXT: text_segment: Union[TextSegment, None] = self._create_text_segment(token) if text_segment is not None: @@ -135,7 +135,7 @@ def _start_new_chapter(self, new_chapter_number: int) -> None: self._current_strategy = self._settings.get_action_for_chapter(new_chapter_number) self._verse_text_quotation_mark_resolver.reset() self._next_scripture_text_segment_builder = TextSegment.Builder() - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER) def _check_for_verse_change(self, block: UsfmUpdateBlock) -> None: for scripture_ref in block.refs: @@ -147,4 +147,4 @@ def _check_for_verse_change(self, block: UsfmUpdateBlock) -> None: self._start_new_verse(self._current_verse_number) def _start_new_verse(self, new_chapter_number: int) -> None: - self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker) + self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VERSE) diff --git a/tests/corpora/analysis/test_chapter.py 
b/tests/corpora/analysis/test_chapter.py index 6f8557f0..cb9f4f23 100644 --- a/tests/corpora/analysis/test_chapter.py +++ b/tests/corpora/analysis/test_chapter.py @@ -18,6 +18,6 @@ def test_initialize_verse() -> None: chapter = Chapter([verse1, verse2]) - assert len(chapter.get_verses()) == 2 - assert chapter.get_verses()[0].get_text_segments() == text_segments1 - assert chapter.get_verses()[1].get_text_segments() == text_segments2 + assert len(chapter.verses) == 2 + assert chapter.verses[0].text_segments == text_segments1 + assert chapter.verses[1].text_segments == text_segments2 diff --git a/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py b/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py index 0c3cb1bd..a41933d1 100644 --- a/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py +++ b/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py @@ -22,27 +22,27 @@ # QuotationMarkResolverState tests def test_get_current_depth_quotation_mark_resolver_state() -> None: quotation_mark_resolver_state = QuotationMarkResolverState() - assert quotation_mark_resolver_state.get_current_depth() == 1 + assert quotation_mark_resolver_state.current_depth == 1 quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert quotation_mark_resolver_state.get_current_depth() == 2 + assert quotation_mark_resolver_state.current_depth == 2 quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert quotation_mark_resolver_state.get_current_depth() == 3 + assert quotation_mark_resolver_state.current_depth == 3 quotation_mark_resolver_state.add_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert quotation_mark_resolver_state.get_current_depth() == 2 + assert 
quotation_mark_resolver_state.current_depth == 2 quotation_mark_resolver_state.add_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert quotation_mark_resolver_state.get_current_depth() == 1 + assert quotation_mark_resolver_state.current_depth == 1 def test_has_open_quotation_mark() -> None: @@ -173,28 +173,28 @@ def test_get_current_depth_quotation_continuer_state() -> None: ) quotation_continuer_state = QuotationContinuerState() - assert quotation_continuer_state.get_current_depth() == 0 + assert quotation_continuer_state.current_depth == 0 quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.get_current_depth() == 1 + assert quotation_continuer_state.current_depth == 1 quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.get_current_depth() == 2 + assert quotation_continuer_state.current_depth == 2 quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.get_current_depth() == 0 + assert quotation_continuer_state.current_depth == 0 def test_has_continuer_been_observed() -> None: @@ -210,28 +210,28 @@ def test_has_continuer_been_observed() -> None: ) quotation_continuer_state = QuotationContinuerState() - assert not quotation_continuer_state.has_continuer_been_observed() + assert not quotation_continuer_state.continuer_has_been_observed() quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), 
quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.has_continuer_been_observed() + assert quotation_continuer_state.continuer_has_been_observed() quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.has_continuer_been_observed() + assert quotation_continuer_state.continuer_has_been_observed() quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) - assert not quotation_continuer_state.has_continuer_been_observed() + assert not quotation_continuer_state.continuer_has_been_observed() def test_get_continuer_style() -> None: @@ -247,28 +247,28 @@ def test_get_continuer_style() -> None: ) quotation_continuer_state = QuotationContinuerState() - assert quotation_continuer_state.get_continuer_style() is QuotationContinuerStyle.UNDETERMINED + assert quotation_continuer_state.continuer_style is QuotationContinuerStyle.UNDETERMINED quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.get_continuer_style() is QuotationContinuerStyle.ENGLISH + assert quotation_continuer_state.continuer_style is QuotationContinuerStyle.ENGLISH quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.SPANISH, ) - assert quotation_continuer_state.get_continuer_style() is QuotationContinuerStyle.SPANISH + assert quotation_continuer_state.continuer_style is QuotationContinuerStyle.SPANISH 
quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.get_continuer_style() is QuotationContinuerStyle.ENGLISH + assert quotation_continuer_state.continuer_style is QuotationContinuerStyle.ENGLISH def test_add_quotation_continuer() -> None: @@ -290,7 +290,7 @@ def test_add_quotation_continuer() -> None: quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) == QuotationMarkMetadata( - "\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text("\u201c").build(), 0, 1 + "\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u201c").build(), 0, 1 ) assert quotation_continuer_state.add_quotation_continuer( @@ -298,16 +298,16 @@ def test_add_quotation_continuer() -> None: quotation_mark_resolver_state, QuotationContinuerStyle.SPANISH, ) == QuotationMarkMetadata( - "\u2018", 2, QuotationMarkDirection.Opening, TextSegment.Builder().set_text("\u2018").build(), 0, 1 + "\u2018", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u2018").build(), 0, 1 ) - assert quotation_continuer_state.get_continuer_style() == QuotationContinuerStyle.SPANISH + assert quotation_continuer_state.continuer_style == QuotationContinuerStyle.SPANISH assert quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, QuotationContinuerStyle.ENGLISH, ) == QuotationMarkMetadata( - "\u201c", 3, QuotationMarkDirection.Opening, TextSegment.Builder().set_text("\u201c").build(), 0, 1 + "\u201c", 3, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u201c").build(), 0, 1 ) @@ -335,10 +335,10 @@ def test_is_english_quotation_continuer() -> None: ) # Should always be false if the continuer style is Spanish - quotation_continuer_state.continuer_style 
= QuotationContinuerStyle.ENGLISH + quotation_continuer_state._continuer_style = QuotationContinuerStyle.ENGLISH assert quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -346,17 +346,17 @@ def test_is_english_quotation_continuer() -> None: None, ) - quotation_continuer_state.continuer_style = QuotationContinuerStyle.SPANISH + quotation_continuer_state._continuer_style = QuotationContinuerStyle.SPANISH assert not quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), None, None, ) - quotation_continuer_state.continuer_style = QuotationContinuerStyle.ENGLISH + quotation_continuer_state._continuer_style = QuotationContinuerStyle.ENGLISH # Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) assert not quotation_mark_categorizer.is_english_quotation_continuer( @@ -371,7 +371,7 @@ def test_is_english_quotation_continuer() -> None: assert quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -401,7 +401,7 @@ def test_is_english_quotation_continuer() -> None: ) assert not empty_quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + 
TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -412,7 +412,7 @@ def test_is_english_quotation_continuer() -> None: # Should be false if the starting index of the quotation mark is greater than 0 assert not quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text(" \u201ctest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text(" \u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), @@ -423,7 +423,7 @@ def test_is_english_quotation_continuer() -> None: # Should be false if the mark does not match the already opened mark assert not quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u2018test").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -438,7 +438,7 @@ def test_is_english_quotation_continuer() -> None: ) assert not quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -447,38 +447,26 @@ def test_is_english_quotation_continuer() -> None: ) assert quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u201c\u2018test") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), None, QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u201c\u2018test") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + 
TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), ) assert quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u201c\u201ctest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u201c\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), None, QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u201c\u201ctest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u201c\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), @@ -487,10 +475,7 @@ def test_is_english_quotation_continuer() -> None: # When there are multiple open quotes, the continuer must match the deepest observed mark quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u201c\u2018test") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -500,10 +485,7 @@ def test_is_english_quotation_continuer() -> None: assert not quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u201c\u201ctest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u201c\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), @@ -513,10 +495,7 @@ def test_is_english_quotation_continuer() -> None: assert quotation_mark_categorizer.is_english_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u201c\u2018test") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + 
TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), @@ -536,7 +515,7 @@ def test_is_english_quotation_continuer() -> None: QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u201c\u2018\u201ctest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build(), 1, 2, @@ -549,7 +528,7 @@ def test_is_english_quotation_continuer() -> None: QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u201c\u2018\u201ctest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build(), 1, 2, @@ -561,7 +540,7 @@ def test_is_english_quotation_continuer() -> None: QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u201c\u2018\u2018test") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build(), 2, 3, @@ -573,7 +552,7 @@ def test_is_english_quotation_continuer() -> None: QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u201c\u2018\u201ctest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build(), 2, 3, @@ -604,10 +583,10 @@ def test_is_spanish_quotation_continuer() -> None: ) # Should always be false if the continuer style is English - quotation_continuer_state.continuer_style = QuotationContinuerStyle.SPANISH + quotation_continuer_state._continuer_style = QuotationContinuerStyle.SPANISH assert quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -615,17 +594,17 @@ def test_is_spanish_quotation_continuer() -> None: None, ) - quotation_continuer_state.continuer_style = QuotationContinuerStyle.ENGLISH + 
quotation_continuer_state._continuer_style = QuotationContinuerStyle.ENGLISH assert not quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), None, None, ) - quotation_continuer_state.continuer_style = QuotationContinuerStyle.SPANISH + quotation_continuer_state._continuer_style = QuotationContinuerStyle.SPANISH # Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) assert not quotation_mark_categorizer.is_spanish_quotation_continuer( @@ -640,7 +619,7 @@ def test_is_spanish_quotation_continuer() -> None: assert quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -670,7 +649,7 @@ def test_is_spanish_quotation_continuer() -> None: ) assert not empty_quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -681,7 +660,7 @@ def test_is_spanish_quotation_continuer() -> None: # Should be false if the starting index of the quotation mark is greater than 0 assert not quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text(" \u00bbtest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text(" \u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), @@ -692,7 
+671,7 @@ def test_is_spanish_quotation_continuer() -> None: # Should be false if the mark does not match the already opened mark assert not quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u201dtest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -707,7 +686,7 @@ def test_is_spanish_quotation_continuer() -> None: ) assert not quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -716,38 +695,26 @@ def test_is_spanish_quotation_continuer() -> None: ) assert quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u00bb\u201dtest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), None, QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u00bb\u201dtest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), ) assert quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u00bb\u00bbtest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u00bb\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), None, QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u00bb\u00bbtest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) 
- .build(), + TextSegment.Builder().set_text("\u00bb\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), @@ -756,10 +723,7 @@ def test_is_spanish_quotation_continuer() -> None: # When there are multiple open quotes, the continuer must match the deepest observed mark quotation_continuer_state.add_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u00bb\u201dtest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), @@ -769,10 +733,7 @@ def test_is_spanish_quotation_continuer() -> None: assert not quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u00bb\u201ctest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u00bb\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), @@ -782,10 +743,7 @@ def test_is_spanish_quotation_continuer() -> None: assert quotation_mark_categorizer.is_spanish_quotation_continuer( QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("\u00bb\u201dtest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, 2, ), @@ -805,7 +763,7 @@ def test_is_spanish_quotation_continuer() -> None: QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u00bb\u201d\u2019test") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build(), 1, 2, @@ -818,7 +776,7 @@ def test_is_spanish_quotation_continuer() -> None: QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u00bb\u201d\u2019test") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build(), 1, 2, @@ -830,7 +788,7 
@@ def test_is_spanish_quotation_continuer() -> None: QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u00bb\u201d\u201dtest") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build(), 2, 3, @@ -842,7 +800,7 @@ def test_is_spanish_quotation_continuer() -> None: QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u00bb\u201d\u2019test") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build(), 2, 3, @@ -2067,10 +2025,10 @@ def test_basic_quotation_mark_recognition() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), ] assert standard_english_quotation_mark_resolver.get_issues() == set() @@ -2093,7 +2051,7 @@ def test_resolution_only_of_passed_matches() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), ] assert standard_english_quotation_mark_resolver.get_issues() == { QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK @@ -2137,10 +2095,10 @@ def test_resolution_across_segments() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment1, 0, 1), - 
QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment2, 0, 1), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment2, 6, 7), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment2, 7, 8), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment2, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment2, 6, 7), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment2, 7, 8), ] assert standard_english_quotation_mark_resolver.get_issues() == set() @@ -2158,7 +2116,7 @@ def test_resolution_with_apostrophes() -> None: text_segment = ( TextSegment.Builder() .set_text("\u201cThis\u2019 is a \u2018quote\u2019\u201d") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build() ) assert list( @@ -2172,10 +2130,10 @@ def test_resolution_with_apostrophes() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment, 12, 13), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment, 18, 19), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 19, 20), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 12, 13), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 19, 20), ] assert standard_english_quotation_mark_resolver.get_issues() == set() @@ -2189,10 +2147,7 @@ def test_resolution_with_apostrophes() -> None: 
typewriter_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_english_resolver_settings) text_segment = ( - TextSegment.Builder() - .set_text("\"This' is a 'quote'\"") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build() + TextSegment.Builder().set_text("\"This' is a 'quote'\"").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build() ) assert list( typewriter_english_quotation_mark_resolver.resolve_quotation_marks( @@ -2205,10 +2160,10 @@ def test_resolution_with_apostrophes() -> None: ] ) ) == [ - QuotationMarkMetadata('"', 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, text_segment, 12, 13), - QuotationMarkMetadata("'", 2, QuotationMarkDirection.Closing, text_segment, 18, 19), - QuotationMarkMetadata('"', 1, QuotationMarkDirection.Closing, text_segment, 19, 20), + QuotationMarkMetadata('"', 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.OPENING, text_segment, 12, 13), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + QuotationMarkMetadata('"', 1, QuotationMarkDirection.CLOSING, text_segment, 19, 20), ] assert standard_english_quotation_mark_resolver.get_issues() == set() @@ -2227,7 +2182,7 @@ def test_english_quote_continuers() -> None: text_segment2 = ( TextSegment.Builder() .set_text("\u201c\u2018This is the rest\u2019 of it\u201d") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build() ) assert list( @@ -2242,12 +2197,12 @@ def test_english_quote_continuers() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment1, 0, 1), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment1, 11, 12), - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment2, 0, 1), - QuotationMarkMetadata("\u2018", 2, 
QuotationMarkDirection.Opening, text_segment2, 1, 2), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment2, 18, 19), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment2, 25, 26), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment1, 11, 12), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment2, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment2, 1, 2), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment2, 18, 19), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment2, 25, 26), ] assert standard_english_quotation_mark_resolver.get_issues() == set() @@ -2266,7 +2221,7 @@ def test_spanish_quote_continuers() -> None: text_segment2 = ( TextSegment.Builder() .set_text("\u00bb\u201dThis is the rest\u201d of it\u00bb") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build() ) assert list( @@ -2281,12 +2236,12 @@ def test_spanish_quote_continuers() -> None: ] ) ) == [ - QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, text_segment1, 0, 1), - QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, text_segment1, 11, 12), - QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Opening, text_segment2, 0, 1), - QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Opening, text_segment2, 1, 2), - QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, text_segment2, 18, 19), - QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, text_segment2, 25, 26), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, text_segment1, 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.OPENING, text_segment1, 11, 12), + 
QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.OPENING, text_segment2, 0, 1), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.OPENING, text_segment2, 1, 2), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.CLOSING, text_segment2, 18, 19), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, text_segment2, 25, 26), ] assert western_european_quotation_mark_resolver.get_issues() == set() @@ -2305,7 +2260,7 @@ def test_malformed_quotation_marks() -> None: text_segment2 = ( TextSegment.Builder() .set_text("This is the rest \u2019 of it \u201d") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build() ) assert list( @@ -2318,10 +2273,10 @@ def test_malformed_quotation_marks() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment1, 0, 1), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment1, 12, 13), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment2, 17, 18), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment2, 25, 26), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment1, 12, 13), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment2, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment2, 25, 26), ] assert standard_english_quotation_mark_resolver.get_issues() == set() @@ -2346,9 +2301,9 @@ def test_unpaired_quotation_mark_issue() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), + 
QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), ] assert standard_english_quotation_mark_resolver.get_issues() == { QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK @@ -2362,7 +2317,7 @@ def test_unpaired_quotation_mark_issue() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 13, 14), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 13, 14), ] assert standard_english_quotation_mark_resolver.get_issues() == { QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK @@ -2393,10 +2348,10 @@ def test_too_deep_nesting_issue() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, text_segment, 6, 7), - QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.Opening, text_segment, 10, 11), - QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.Opening, text_segment, 13, 14), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 6, 7), + QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.OPENING, text_segment, 10, 11), + QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.OPENING, text_segment, 13, 14), # QuotationMarkMetadata("\u201c", 5, QuotationMarkDirection.Opening, text_segment, 20, 21), ] assert standard_english_quotation_mark_resolver.get_issues() == { @@ -2426,10 +2381,10 @@ def test_incompatible_quotation_mark_issue() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, 
text_segment, 11, 12), - QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), ] assert standard_english_quotation_mark_resolver.get_issues() == { QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK @@ -2487,10 +2442,7 @@ def test_typewriter_english_quotation_mark_recognition() -> None: typewriter_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_english_resolver_settings) text_segment = ( - TextSegment.Builder() - .set_text("\"This is a 'quote'\"") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build() + TextSegment.Builder().set_text("\"This is a 'quote'\"").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build() ) assert list( typewriter_english_quotation_mark_resolver.resolve_quotation_marks( @@ -2502,10 +2454,10 @@ def test_typewriter_english_quotation_mark_recognition() -> None: ] ) ) == [ - QuotationMarkMetadata('"', 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("'", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), - QuotationMarkMetadata("'", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), - QuotationMarkMetadata('"', 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata('"', 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata('"', 1, 
QuotationMarkDirection.CLOSING, text_segment, 18, 19), ] assert typewriter_english_quotation_mark_resolver.get_issues() == set() @@ -2531,10 +2483,10 @@ def test_typewriter_french_mark_recognition() -> None: ] ) ) == [ - QuotationMarkMetadata("<<", 1, QuotationMarkDirection.Opening, text_segment, 0, 2), - QuotationMarkMetadata("<", 2, QuotationMarkDirection.Opening, text_segment, 12, 13), - QuotationMarkMetadata(">", 2, QuotationMarkDirection.Closing, text_segment, 18, 19), - QuotationMarkMetadata(">>", 1, QuotationMarkDirection.Closing, text_segment, 19, 21), + QuotationMarkMetadata("<<", 1, QuotationMarkDirection.OPENING, text_segment, 0, 2), + QuotationMarkMetadata("<", 2, QuotationMarkDirection.OPENING, text_segment, 12, 13), + QuotationMarkMetadata(">", 2, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + QuotationMarkMetadata(">>", 1, QuotationMarkDirection.CLOSING, text_segment, 19, 21), ] assert typewriter_french_quotation_mark_resolver.get_issues() == set() @@ -2552,7 +2504,7 @@ def test_central_european_quotation_mark_recognition() -> None: text_segment = ( TextSegment.Builder() .set_text("\u201eThis is a \u201aquote\u2018\u201c") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build() ) assert list( @@ -2565,10 +2517,10 @@ def test_central_european_quotation_mark_recognition() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("\u201a", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u201a", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u2018", 2, 
QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), ] assert central_european_quotation_mark_resolver.get_issues() == set() @@ -2586,7 +2538,7 @@ def test_standard_swedish_quotation_mark_recognition() -> None: text_segment = ( TextSegment.Builder() .set_text("\u201dThis is a \u2019quote\u2019\u201d") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build() ) assert list( @@ -2599,10 +2551,10 @@ def test_standard_swedish_quotation_mark_recognition() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Opening, text_segment, 11, 12), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), ] assert standard_swedish_quotation_mark_resolver.get_issues() == set() @@ -2634,7 +2586,7 @@ def test_multiple_conventions_quotation_mark_recognition() -> None: text_segment = ( TextSegment.Builder() .set_text("\u201eThis is a \u2019quote>\u201c") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build() ) assert list( @@ -2647,9 +2599,9 @@ def test_multiple_conventions_quotation_mark_recognition() -> None: ] ) ) == [ - QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.Opening, text_segment, 0, 1), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Opening, 
text_segment, 11, 12), - QuotationMarkMetadata(">", 2, QuotationMarkDirection.Closing, text_segment, 17, 18), - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Closing, text_segment, 18, 19), + QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata(">", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), ] assert multiple_conventions_quotation_mark_resolver.get_issues() == set() diff --git a/tests/corpora/analysis/test_quotation_mark_metadata.py b/tests/corpora/analysis/test_quotation_mark_metadata.py index c81954a1..d7c2395e 100644 --- a/tests/corpora/analysis/test_quotation_mark_metadata.py +++ b/tests/corpora/analysis/test_quotation_mark_metadata.py @@ -13,35 +13,35 @@ def test_update_quotation_mark() -> None: quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', depth=1, - direction=QuotationMarkDirection.Opening, + direction=QuotationMarkDirection.OPENING, text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), start_index=22, end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said," + assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', depth=1, - direction=QuotationMarkDirection.Opening, + direction=QuotationMarkDirection.OPENING, text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), start_index=22, end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert 
quotation_mark_metadata.text_segment.text == "He said to the woman, «Has God really said," + assert quotation_mark_metadata.text_segment._text == "He said to the woman, «Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', depth=1, - direction=QuotationMarkDirection.Opening, + direction=QuotationMarkDirection.OPENING, text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), start_index=23, end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert quotation_mark_metadata.text_segment.text == 'He said to the woman, "«as God really said,' + assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,' def get_quote_convention_by_name(name: str) -> QuoteConvention: diff --git a/tests/corpora/analysis/test_quotation_mark_resolver.py b/tests/corpora/analysis/test_quotation_mark_resolver.py index 7f207eec..0b30172c 100644 --- a/tests/corpora/analysis/test_quotation_mark_resolver.py +++ b/tests/corpora/analysis/test_quotation_mark_resolver.py @@ -16,38 +16,35 @@ def test_reset() -> None: QuoteConventionDetectionResolutionSettings(standard_quote_conventions.standard_quote_conventions) ) - assert quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack == [] - assert quotation_mark_resolver._quotation_continuer_state.quotation_continuer_stack == [] - assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 - assert quotation_mark_resolver._quotation_continuer_state.current_depth == 0 + assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] + assert quotation_mark_resolver._quotation_continuer_state._quotation_continuer_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state._current_depth == 0 + assert quotation_mark_resolver._quotation_continuer_state._current_depth == 0 quotation_mark_resolver.reset() - 
assert quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack == [] - assert quotation_mark_resolver._quotation_continuer_state.quotation_continuer_stack == [] - assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 - assert quotation_mark_resolver._quotation_continuer_state.current_depth == 0 + assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] + assert quotation_mark_resolver._quotation_continuer_state._quotation_continuer_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state._current_depth == 0 + assert quotation_mark_resolver._quotation_continuer_state._current_depth == 0 quotation_mark_string_matches: List[QuotationMarkStringMatch] = [ QuotationMarkStringMatch(TextSegment.Builder().set_text("Opening “quote").build(), 8, 9), QuotationMarkStringMatch(TextSegment.Builder().set_text("Another opening ‘quote").build(), 16, 17), QuotationMarkStringMatch( - TextSegment.Builder() - .set_text("“‘quote continuer") - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) - .build(), + TextSegment.Builder().set_text("“‘quote continuer").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), ] list(quotation_mark_resolver.resolve_quotation_marks(quotation_mark_string_matches)) - assert len(quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack) > 0 - assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth > 0 + assert len(quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack) > 0 + assert quotation_mark_resolver._quotation_mark_resolver_state._current_depth > 0 quotation_mark_resolver.reset() - assert quotation_mark_resolver._quotation_mark_resolver_state.quotation_stack == [] - assert quotation_mark_resolver._quotation_continuer_state.quotation_continuer_stack == [] - assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 - assert 
quotation_mark_resolver._quotation_continuer_state.current_depth == 0 + assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] + assert quotation_mark_resolver._quotation_continuer_state._quotation_continuer_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state._current_depth == 0 + assert quotation_mark_resolver._quotation_continuer_state._current_depth == 0 diff --git a/tests/corpora/analysis/test_quotation_mark_string_match.py b/tests/corpora/analysis/test_quotation_mark_string_match.py index de3f21d1..e54caa31 100644 --- a/tests/corpora/analysis/test_quotation_mark_string_match.py +++ b/tests/corpora/analysis/test_quotation_mark_string_match.py @@ -16,15 +16,15 @@ def test_get_quotation_mark() -> None: quotation_mark_string_match = QuotationMarkStringMatch( TextSegment.Builder().set_text("quick brown fox").build(), 6, 7 ) - assert quotation_mark_string_match.get_quotation_mark() == "b" + assert quotation_mark_string_match.quotation_mark == "b" quotation_mark_string_match = QuotationMarkStringMatch( TextSegment.Builder().set_text("quick brown fox").build(), 6, 10 ) - assert quotation_mark_string_match.get_quotation_mark() == "brow" + assert quotation_mark_string_match.quotation_mark == "brow" quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("q").build(), 0, 1) - assert quotation_mark_string_match.get_quotation_mark() == "q" + assert quotation_mark_string_match.quotation_mark == "q" def test_is_valid_opening_quotation_mark() -> None: @@ -79,120 +79,120 @@ def test_is_valid_closing_quotation_mark() -> None: def test_does_quotation_mark_match() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) - assert quotation_mark_string_match.does_quotation_mark_match(regex.compile(r"^s$")) - assert not quotation_mark_string_match.does_quotation_mark_match(regex.compile(r"a")) - assert not 
quotation_mark_string_match.does_quotation_mark_match(regex.compile(r"sa")) + assert quotation_mark_string_match.quotation_mark_matches(regex.compile(r"^s$")) + assert not quotation_mark_string_match.quotation_mark_matches(regex.compile(r"a")) + assert not quotation_mark_string_match.quotation_mark_matches(regex.compile(r"sa")) def test_does_next_character_match() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) - assert not quotation_mark_string_match.does_next_character_match(regex.compile(r"^s$")) - assert quotation_mark_string_match.does_next_character_match(regex.compile(r"a")) - assert not quotation_mark_string_match.does_next_character_match(regex.compile(r"sa")) + assert not quotation_mark_string_match.next_character_matches(regex.compile(r"^s$")) + assert quotation_mark_string_match.next_character_matches(regex.compile(r"a")) + assert not quotation_mark_string_match.next_character_matches(regex.compile(r"sa")) quotation_mark_string_match = QuotationMarkStringMatch( TextSegment.Builder().set_text("sample text").build(), 10, 11 ) - assert not quotation_mark_string_match.does_next_character_match(regex.compile(r".*")) + assert not quotation_mark_string_match.next_character_matches(regex.compile(r".*")) def test_does_previous_character_match() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) - assert quotation_mark_string_match.does_previous_character_match(regex.compile(r"^s$")) - assert not quotation_mark_string_match.does_previous_character_match(regex.compile(r"a")) - assert not quotation_mark_string_match.does_previous_character_match(regex.compile(r"sa")) + assert quotation_mark_string_match.previous_character_matches(regex.compile(r"^s$")) + assert not quotation_mark_string_match.previous_character_matches(regex.compile(r"a")) + assert not 
quotation_mark_string_match.previous_character_matches(regex.compile(r"sa")) quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) - assert not quotation_mark_string_match.does_previous_character_match(regex.compile(r".*")) + assert not quotation_mark_string_match.previous_character_matches(regex.compile(r".*")) def test_get_previous_character() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) - assert quotation_mark_string_match.get_previous_character() == "s" + assert quotation_mark_string_match.previous_character == "s" quotation_mark_string_match = QuotationMarkStringMatch( TextSegment.Builder().set_text("sample text").build(), 10, 11 ) - assert quotation_mark_string_match.get_previous_character() == "x" + assert quotation_mark_string_match.previous_character == "x" quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) - assert quotation_mark_string_match.get_previous_character() is None + assert quotation_mark_string_match.previous_character is None quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) - assert quotation_mark_string_match.get_previous_character() == "“" + assert quotation_mark_string_match.previous_character == "“" def test_get_next_character() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) - assert quotation_mark_string_match.get_next_character() == "m" + assert quotation_mark_string_match.next_character == "m" quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) - assert quotation_mark_string_match.get_next_character() == "a" + assert quotation_mark_string_match.next_character == "a" quotation_mark_string_match = QuotationMarkStringMatch( 
TextSegment.Builder().set_text("sample text").build(), 10, 11 ) - assert quotation_mark_string_match.get_next_character() is None + assert quotation_mark_string_match.next_character is None quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1) - assert quotation_mark_string_match.get_next_character() == "”" + assert quotation_mark_string_match.next_character == "”" def test_does_leading_substring_match() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 5, 6) - assert quotation_mark_string_match.does_leading_substring_match(regex.compile(r"^sampl$")) + assert quotation_mark_string_match.leading_substring_matches(regex.compile(r"^sampl$")) quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) - assert not quotation_mark_string_match.does_leading_substring_match(regex.compile(r".+")) + assert not quotation_mark_string_match.leading_substring_matches(regex.compile(r".+")) quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) - assert quotation_mark_string_match.does_leading_substring_match(regex.compile(r"\u201c")) + assert quotation_mark_string_match.leading_substring_matches(regex.compile(r"\u201c")) def test_does_trailing_substring_match() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 5, 6) - assert quotation_mark_string_match.does_trailing_substring_match(regex.compile(r"^ text$")) + assert quotation_mark_string_match.trailing_substring_matches(regex.compile(r"^ text$")) quotation_mark_string_match = QuotationMarkStringMatch( TextSegment.Builder().set_text("sample text").build(), 11, 12 ) - assert not quotation_mark_string_match.does_trailing_substring_match(regex.compile(r".+")) + assert not 
quotation_mark_string_match.trailing_substring_matches(regex.compile(r".+")) quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1) - assert quotation_mark_string_match.does_trailing_substring_match(regex.compile(r"\u201d")) + assert quotation_mark_string_match.trailing_substring_matches(regex.compile(r"\u201d")) def test_get_context() -> None: quotation_mark_string_match = QuotationMarkStringMatch( TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 15, 16 ) - assert quotation_mark_string_match.get_context() == "is a bunch' of sample" + assert quotation_mark_string_match.context == "is a bunch' of sample" quotation_mark_string_match = QuotationMarkStringMatch( TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 5, 6 ) - assert quotation_mark_string_match.get_context() == "this is a bunch'" + assert quotation_mark_string_match.context == "this is a bunch'" quotation_mark_string_match = QuotationMarkStringMatch( TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 25, 26 ) - assert quotation_mark_string_match.get_context() == "' of sample text" + assert quotation_mark_string_match.context == "' of sample text" quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("short").build(), 3, 4) - assert quotation_mark_string_match.get_context() == "short" + assert quotation_mark_string_match.context == "short" def test_resolve() -> None: text_segment = TextSegment.Builder().set_text("'").build() quotation_mark_string_match = QuotationMarkStringMatch(text_segment, 0, 1) - assert quotation_mark_string_match.resolve(2, QuotationMarkDirection.Opening) == QuotationMarkMetadata( - "'", 2, QuotationMarkDirection.Opening, text_segment, 0, 1 + assert quotation_mark_string_match.resolve(2, QuotationMarkDirection.OPENING) == QuotationMarkMetadata( + "'", 2, QuotationMarkDirection.OPENING, text_segment, 0, 1 ) - assert 
quotation_mark_string_match.resolve(1, QuotationMarkDirection.Opening) == QuotationMarkMetadata( - "'", 1, QuotationMarkDirection.Opening, text_segment, 0, 1 + assert quotation_mark_string_match.resolve(1, QuotationMarkDirection.OPENING) == QuotationMarkMetadata( + "'", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1 ) - assert quotation_mark_string_match.resolve(1, QuotationMarkDirection.Closing) == QuotationMarkMetadata( - "'", 1, QuotationMarkDirection.Closing, text_segment, 0, 1 + assert quotation_mark_string_match.resolve(1, QuotationMarkDirection.CLOSING) == QuotationMarkMetadata( + "'", 1, QuotationMarkDirection.CLOSING, text_segment, 0, 1 ) @@ -245,32 +245,34 @@ def test_has_leading_whitespace() -> None: assert not quotation_mark_string_match.has_leading_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), 0, 1 + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, ) assert quotation_mark_string_match.has_leading_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.EmbedMarker).build(), 0, 1 + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.EMBED).build(), 0, 1 ) assert quotation_mark_string_match.has_leading_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.VerseMarker).build(), 0, 1 + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.VERSE).build(), 0, 1 ) assert quotation_mark_string_match.has_leading_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.ChapterMarker).build(), 0, 1 + 
TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.CHAPTER).build(), 0, 1 ) assert not quotation_mark_string_match.has_leading_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.CharacterMarker).build(), 0, 1 + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.CHARACTER).build(), 0, 1 ) assert not quotation_mark_string_match.has_leading_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("\u201csample text").add_preceding_marker(UsfmMarkerType.VerseMarker).build(), + TextSegment.Builder().set_text("\u201csample text").add_preceding_marker(UsfmMarkerType.VERSE).build(), 0, 1, ) @@ -290,19 +292,19 @@ def test_has_trailing_whitespace() -> None: assert not quotation_mark_string_match.has_trailing_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.ParagraphMarker).build(), + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 10, 11, ) assert not quotation_mark_string_match.has_trailing_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.EmbedMarker).build(), 10, 11 + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.EMBED).build(), 10, 11 ) assert not quotation_mark_string_match.has_trailing_whitespace() quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.VerseMarker).build(), 10, 11 + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.VERSE).build(), 10, 11 ) assert not quotation_mark_string_match.has_trailing_whitespace() diff --git 
a/tests/corpora/analysis/test_quotation_mark_tabulator.py b/tests/corpora/analysis/test_quotation_mark_tabulator.py index 25052a07..dde9ea3a 100644 --- a/tests/corpora/analysis/test_quotation_mark_tabulator.py +++ b/tests/corpora/analysis/test_quotation_mark_tabulator.py @@ -32,7 +32,7 @@ def test_get_best_proportion() -> None: counts.count_quotation_mark('"') counts.count_quotation_mark("'") - best_str, best_count, total_count = counts.get_best_proportion() + best_str, best_count, total_count = counts.find_best_quotation_mark_proportion() assert best_str == '"' assert best_count == 2 assert total_count == 3 @@ -40,7 +40,7 @@ def test_get_best_proportion() -> None: counts.count_quotation_mark("'") counts.count_quotation_mark("'") - best_str, best_count, total_count = counts.get_best_proportion() + best_str, best_count, total_count = counts.find_best_quotation_mark_proportion() assert best_str == "'" assert best_count == 3 assert total_count == 5 @@ -67,8 +67,8 @@ def test_calculate_similarity() -> None: single_level_quotation_mark_tabulator = QuotationMarkTabulator() single_level_quotation_mark_tabulator.tabulate( [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 0, 1), ] ) @@ -110,10 +110,10 @@ def test_calculate_similarity() -> None: two_level_quotation_mark_tabulator = QuotationMarkTabulator() two_level_quotation_mark_tabulator.tabulate( [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u2018", 2, 
QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 2), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 0, 2), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 2), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 0, 2), ] ) assert two_level_quotation_mark_tabulator.calculate_similarity( diff --git a/tests/corpora/analysis/test_quote_convention.py b/tests/corpora/analysis/test_quote_convention.py index 165b1b4a..6a863f4f 100644 --- a/tests/corpora/analysis/test_quote_convention.py +++ b/tests/corpora/analysis/test_quote_convention.py @@ -5,83 +5,83 @@ def test_single_level_quote_convention_normalize() -> None: english_level1_quote_convention = SingleLevelQuoteConvention("\u201c", "\u201d") normalized_english_level1_quote_convention = english_level1_quote_convention.normalize() - assert normalized_english_level1_quote_convention.get_opening_quote() == '"' - assert normalized_english_level1_quote_convention.get_closing_quote() == '"' + assert normalized_english_level1_quote_convention.opening_quote == '"' + assert normalized_english_level1_quote_convention.closing_quote == '"' english_level2_quote_convention = SingleLevelQuoteConvention("\u2018", "\u2019") normalized_english_level2_quote_convention = english_level2_quote_convention.normalize() - assert normalized_english_level2_quote_convention.get_opening_quote() == "'" - assert normalized_english_level2_quote_convention.get_closing_quote() == "'" + assert normalized_english_level2_quote_convention.opening_quote == "'" + assert normalized_english_level2_quote_convention.closing_quote == "'" 
already_normalized_english_level1_quote_convention = SingleLevelQuoteConvention('"', '"') doubly_normalized_english_level1_quote_convention = already_normalized_english_level1_quote_convention.normalize() - assert doubly_normalized_english_level1_quote_convention.get_opening_quote() == '"' - assert doubly_normalized_english_level1_quote_convention.get_closing_quote() == '"' + assert doubly_normalized_english_level1_quote_convention.opening_quote == '"' + assert doubly_normalized_english_level1_quote_convention.closing_quote == '"' already_normalized_english_level2_quote_convention = SingleLevelQuoteConvention("'", "'") doubly_normalized_english_level2_quote_convention = already_normalized_english_level2_quote_convention.normalize() - assert doubly_normalized_english_level2_quote_convention.get_opening_quote() == "'" - assert doubly_normalized_english_level2_quote_convention.get_closing_quote() == "'" + assert doubly_normalized_english_level2_quote_convention.opening_quote == "'" + assert doubly_normalized_english_level2_quote_convention.closing_quote == "'" french_level1_quote_convention = SingleLevelQuoteConvention("\u00ab", "\u00bb") normalized_french_level1_quote_convention = french_level1_quote_convention.normalize() - assert normalized_french_level1_quote_convention.get_opening_quote() == '"' - assert normalized_french_level1_quote_convention.get_closing_quote() == '"' + assert normalized_french_level1_quote_convention.opening_quote == '"' + assert normalized_french_level1_quote_convention.closing_quote == '"' french_level2_quote_convention = SingleLevelQuoteConvention("\u2039", "\u203a") normalized_french_level2_quote_convention = french_level2_quote_convention.normalize() - assert normalized_french_level2_quote_convention.get_opening_quote() == "\u2039" - assert normalized_french_level2_quote_convention.get_closing_quote() == "\u203a" + assert normalized_french_level2_quote_convention.opening_quote == "\u2039" + assert 
normalized_french_level2_quote_convention.closing_quote == "\u203a" typewriter_french_level1_quote_convention = SingleLevelQuoteConvention("<<", ">>") normalized_typewriter_french_level1_quote_convention = typewriter_french_level1_quote_convention.normalize() - assert normalized_typewriter_french_level1_quote_convention.get_opening_quote() == "<<" - assert normalized_typewriter_french_level1_quote_convention.get_closing_quote() == ">>" + assert normalized_typewriter_french_level1_quote_convention.opening_quote == "<<" + assert normalized_typewriter_french_level1_quote_convention.closing_quote == ">>" typewriter_french_level2_quote_convention = SingleLevelQuoteConvention("<", ">") normalized_typewriter_french_level2_quote_convention = typewriter_french_level2_quote_convention.normalize() - assert normalized_typewriter_french_level2_quote_convention.get_opening_quote() == "<" - assert normalized_typewriter_french_level2_quote_convention.get_closing_quote() == ">" + assert normalized_typewriter_french_level2_quote_convention.opening_quote == "<" + assert normalized_typewriter_french_level2_quote_convention.closing_quote == ">" central_european_level1_quote_convention = SingleLevelQuoteConvention("\u201e", "\u201c") normalized_central_european_level1_quote_convention = central_european_level1_quote_convention.normalize() - assert normalized_central_european_level1_quote_convention.get_opening_quote() == '"' - assert normalized_central_european_level1_quote_convention.get_closing_quote() == '"' + assert normalized_central_european_level1_quote_convention.opening_quote == '"' + assert normalized_central_european_level1_quote_convention.closing_quote == '"' central_european_level2_quote_convention = SingleLevelQuoteConvention("\u201a", "\u2018") normalized_central_european_level2_quote_convention = central_european_level2_quote_convention.normalize() - assert normalized_central_european_level2_quote_convention.get_opening_quote() == "'" - assert 
normalized_central_european_level2_quote_convention.get_closing_quote() == "'" + assert normalized_central_european_level2_quote_convention.opening_quote == "'" + assert normalized_central_european_level2_quote_convention.closing_quote == "'" central_european_guillemets_quote_convention = SingleLevelQuoteConvention("\u00bb", "\u00ab") normalized_central_european_guillemets_quote_convention = central_european_guillemets_quote_convention.normalize() - assert normalized_central_european_guillemets_quote_convention.get_opening_quote() == '"' - assert normalized_central_european_guillemets_quote_convention.get_closing_quote() == '"' + assert normalized_central_european_guillemets_quote_convention.opening_quote == '"' + assert normalized_central_european_guillemets_quote_convention.closing_quote == '"' swedish_level1_quote_convention = SingleLevelQuoteConvention("\u201d", "\u201d") normalized_swedish_level1_quote_convention = swedish_level1_quote_convention.normalize() - assert normalized_swedish_level1_quote_convention.get_opening_quote() == '"' - assert normalized_swedish_level1_quote_convention.get_closing_quote() == '"' + assert normalized_swedish_level1_quote_convention.opening_quote == '"' + assert normalized_swedish_level1_quote_convention.closing_quote == '"' swedish_level2_quote_convention = SingleLevelQuoteConvention("\u2019", "\u2019") normalized_swedish_level2_quote_convention = swedish_level2_quote_convention.normalize() - assert normalized_swedish_level2_quote_convention.get_opening_quote() == "'" - assert normalized_swedish_level2_quote_convention.get_closing_quote() == "'" + assert normalized_swedish_level2_quote_convention.opening_quote == "'" + assert normalized_swedish_level2_quote_convention.closing_quote == "'" finnish_level1_quote_convention = SingleLevelQuoteConvention("\u00bb", "\u00bb") normalized_finnish_level1_quote_convention = finnish_level1_quote_convention.normalize() - assert normalized_finnish_level1_quote_convention.get_opening_quote() 
== '"' - assert normalized_finnish_level1_quote_convention.get_closing_quote() == '"' + assert normalized_finnish_level1_quote_convention.opening_quote == '"' + assert normalized_finnish_level1_quote_convention.closing_quote == '"' arabic_level1_quote_convention = SingleLevelQuoteConvention("\u201d", "\u201c") normalized_arabic_level1_quote_convention = arabic_level1_quote_convention.normalize() - assert normalized_arabic_level1_quote_convention.get_opening_quote() == '"' - assert normalized_arabic_level1_quote_convention.get_closing_quote() == '"' + assert normalized_arabic_level1_quote_convention.opening_quote == '"' + assert normalized_arabic_level1_quote_convention.closing_quote == '"' arabic_level2_quote_convention = SingleLevelQuoteConvention("\u2019", "\u2018") normalized_arabic_level2_quote_convention = arabic_level2_quote_convention.normalize() - assert normalized_arabic_level2_quote_convention.get_opening_quote() == "'" - assert normalized_arabic_level2_quote_convention.get_closing_quote() == "'" + assert normalized_arabic_level2_quote_convention.opening_quote == "'" + assert normalized_arabic_level2_quote_convention.closing_quote == "'" def test_get_num_levels() -> None: @@ -151,16 +151,16 @@ def test_get_expected_quotation_mark() -> None: SingleLevelQuoteConvention("\u00ab", "\u00bb"), ], ) - assert quote_convention.get_expected_quotation_mark(1, QuotationMarkDirection.Opening) == "\u201c" - assert quote_convention.get_expected_quotation_mark(1, QuotationMarkDirection.Closing) == "\u201d" - assert quote_convention.get_expected_quotation_mark(2, QuotationMarkDirection.Opening) == "\u2018" - assert quote_convention.get_expected_quotation_mark(2, QuotationMarkDirection.Closing) == "\u2019" - assert quote_convention.get_expected_quotation_mark(3, QuotationMarkDirection.Opening) == "\u00ab" - assert quote_convention.get_expected_quotation_mark(3, QuotationMarkDirection.Closing) == "\u00bb" - assert quote_convention.get_expected_quotation_mark(4, 
QuotationMarkDirection.Opening) == "" - assert quote_convention.get_expected_quotation_mark(4, QuotationMarkDirection.Closing) == "" - assert quote_convention.get_expected_quotation_mark(0, QuotationMarkDirection.Opening) == "" - assert quote_convention.get_expected_quotation_mark(0, QuotationMarkDirection.Closing) == "" + assert quote_convention.get_expected_quotation_mark(1, QuotationMarkDirection.OPENING) == "\u201c" + assert quote_convention.get_expected_quotation_mark(1, QuotationMarkDirection.CLOSING) == "\u201d" + assert quote_convention.get_expected_quotation_mark(2, QuotationMarkDirection.OPENING) == "\u2018" + assert quote_convention.get_expected_quotation_mark(2, QuotationMarkDirection.CLOSING) == "\u2019" + assert quote_convention.get_expected_quotation_mark(3, QuotationMarkDirection.OPENING) == "\u00ab" + assert quote_convention.get_expected_quotation_mark(3, QuotationMarkDirection.CLOSING) == "\u00bb" + assert quote_convention.get_expected_quotation_mark(4, QuotationMarkDirection.OPENING) == "" + assert quote_convention.get_expected_quotation_mark(4, QuotationMarkDirection.CLOSING) == "" + assert quote_convention.get_expected_quotation_mark(0, QuotationMarkDirection.OPENING) == "" + assert quote_convention.get_expected_quotation_mark(0, QuotationMarkDirection.CLOSING) == "" def test_includes_opening_quotation_mark() -> None: @@ -257,16 +257,16 @@ def test_get_possible_depths() -> None: SingleLevelQuoteConvention("\u2018", "\u2019"), ], ) - assert quote_convention.get_possible_depths("\u201c", QuotationMarkDirection.Opening) == {1, 3} - assert quote_convention.get_possible_depths("\u201c", QuotationMarkDirection.Closing) == set() - assert quote_convention.get_possible_depths("\u2018", QuotationMarkDirection.Opening) == {2, 4} - assert quote_convention.get_possible_depths("\u2018", QuotationMarkDirection.Closing) == set() - assert quote_convention.get_possible_depths("\u201d", QuotationMarkDirection.Opening) == set() - assert 
quote_convention.get_possible_depths("\u201d", QuotationMarkDirection.Closing) == {1, 3} - assert quote_convention.get_possible_depths("\u2019", QuotationMarkDirection.Opening) == set() - assert quote_convention.get_possible_depths("\u2019", QuotationMarkDirection.Closing) == {2, 4} - assert quote_convention.get_possible_depths("\u00ab", QuotationMarkDirection.Opening) == set() - assert quote_convention.get_possible_depths("\u00ab", QuotationMarkDirection.Closing) == set() + assert quote_convention.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == {1, 3} + assert quote_convention.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() + assert quote_convention.get_possible_depths("\u2018", QuotationMarkDirection.OPENING) == {2, 4} + assert quote_convention.get_possible_depths("\u2018", QuotationMarkDirection.CLOSING) == set() + assert quote_convention.get_possible_depths("\u201d", QuotationMarkDirection.OPENING) == set() + assert quote_convention.get_possible_depths("\u201d", QuotationMarkDirection.CLOSING) == {1, 3} + assert quote_convention.get_possible_depths("\u2019", QuotationMarkDirection.OPENING) == set() + assert quote_convention.get_possible_depths("\u2019", QuotationMarkDirection.CLOSING) == {2, 4} + assert quote_convention.get_possible_depths("\u00ab", QuotationMarkDirection.OPENING) == set() + assert quote_convention.get_possible_depths("\u00ab", QuotationMarkDirection.CLOSING) == set() def test_is_compatible_with_observed_quotation_marks() -> None: diff --git a/tests/corpora/analysis/test_quote_convention_detector.py b/tests/corpora/analysis/test_quote_convention_detector.py index a9c142b2..442643b6 100644 --- a/tests/corpora/analysis/test_quote_convention_detector.py +++ b/tests/corpora/analysis/test_quote_convention_detector.py @@ -16,7 +16,7 @@ def test_standard_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == 
"standard_english" + assert analysis.best_quote_convention.get_name() == "standard_english" def test_typewriter_english() -> None: @@ -29,7 +29,7 @@ def test_typewriter_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "typewriter_english" + assert analysis.best_quote_convention.get_name() == "typewriter_english" def test_british_english() -> None: @@ -42,7 +42,7 @@ def test_british_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "british_english" + assert analysis.best_quote_convention.get_name() == "british_english" def test_british_typewriter_english() -> None: @@ -55,7 +55,7 @@ def test_british_typewriter_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "british_typewriter_english" + assert analysis.best_quote_convention.get_name() == "british_typewriter_english" def test_hybrid_typewriter_english() -> None: @@ -68,7 +68,7 @@ def test_hybrid_typewriter_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "hybrid_typewriter_english" + assert analysis.best_quote_convention.get_name() == "hybrid_typewriter_english" def test_standard_french() -> None: @@ -81,7 +81,7 @@ def test_standard_french() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "standard_french" + assert analysis.best_quote_convention.get_name() == "standard_french" def test_typewriter_french() -> None: @@ -94,7 +94,7 @@ def test_typewriter_french() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "typewriter_french" 
+ assert analysis.best_quote_convention.get_name() == "typewriter_french" # french_variant requires a 3rd-level of quotes to differentiate from standard_french @@ -110,7 +110,7 @@ def test_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "western_european" + assert analysis.best_quote_convention.get_name() == "western_european" def test_british_inspired_western_european() -> None: @@ -123,7 +123,7 @@ def test_british_inspired_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "british_inspired_western_european" + assert analysis.best_quote_convention.get_name() == "british_inspired_western_european" def test_typewriter_western_european() -> None: @@ -136,7 +136,7 @@ def test_typewriter_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "typewriter_western_european" + assert analysis.best_quote_convention.get_name() == "typewriter_western_european" def test_typewriter_western_european_variant() -> None: @@ -149,7 +149,7 @@ def test_typewriter_western_european_variant() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "typewriter_western_european_variant" + assert analysis.best_quote_convention.get_name() == "typewriter_western_european_variant" def test_hybrid_typewriter_western_european() -> None: @@ -162,7 +162,7 @@ def test_hybrid_typewriter_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "hybrid_typewriter_western_european" + assert analysis.best_quote_convention.get_name() == "hybrid_typewriter_western_european" def 
test_hybrid_british_typewriter_western_european() -> None: @@ -175,7 +175,7 @@ def test_hybrid_british_typewriter_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "hybrid_british_typewriter_western_european" + assert analysis.best_quote_convention.get_name() == "hybrid_british_typewriter_western_european" def test_central_european() -> None: @@ -188,7 +188,7 @@ def test_central_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "central_european" + assert analysis.best_quote_convention.get_name() == "central_european" def test_central_european_guillemets() -> None: @@ -201,7 +201,7 @@ def test_central_european_guillemets() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "central_european_guillemets" + assert analysis.best_quote_convention.get_name() == "central_european_guillemets" def test_standard_swedish() -> None: @@ -214,7 +214,7 @@ def test_standard_swedish() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "standard_swedish" + assert analysis.best_quote_convention.get_name() == "standard_swedish" def test_standard_finnish() -> None: @@ -227,7 +227,7 @@ def test_standard_finnish() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "standard_finnish" + assert analysis.best_quote_convention.get_name() == "standard_finnish" def test_eastern_european() -> None: @@ -240,7 +240,7 @@ def test_eastern_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "eastern_european" + assert 
analysis.best_quote_convention.get_name() == "eastern_european" def test_standard_russian() -> None: @@ -253,7 +253,7 @@ def test_standard_russian() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "standard_russian" + assert analysis.best_quote_convention.get_name() == "standard_russian" def test_standard_arabic() -> None: @@ -266,7 +266,7 @@ def test_standard_arabic() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "standard_arabic" + assert analysis.best_quote_convention.get_name() == "standard_arabic" def test_non_standard_arabic() -> None: @@ -279,7 +279,7 @@ def test_non_standard_arabic() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "non-standard_arabic" + assert analysis.best_quote_convention.get_name() == "non-standard_arabic" def test_mismatched_quotation_marks() -> None: @@ -296,7 +296,7 @@ def test_mismatched_quotation_marks() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.get_best_quote_convention().get_name() == "standard_english" + assert analysis.best_quote_convention.get_name() == "standard_english" def detect_quote_convention(usfm: str) -> Union[QuoteConventionAnalysis, None]: diff --git a/tests/corpora/analysis/test_quote_convention_set.py b/tests/corpora/analysis/test_quote_convention_set.py index d24996e2..a2b4fb09 100644 --- a/tests/corpora/analysis/test_quote_convention_set.py +++ b/tests/corpora/analysis/test_quote_convention_set.py @@ -13,16 +13,16 @@ def test_quote_regexes() -> None: empty_quote_convention_set = QuoteConventionSet([]) - assert empty_quote_convention_set.opening_quotation_mark_regex.pattern == r"" - assert empty_quote_convention_set.closing_quotation_mark_regex.pattern == r"" - assert 
empty_quote_convention_set.all_quotation_mark_regex.pattern == r"" + assert empty_quote_convention_set._opening_quotation_mark_regex.pattern == r"" + assert empty_quote_convention_set._closing_quotation_mark_regex.pattern == r"" + assert empty_quote_convention_set._all_quotation_mark_regex.pattern == r"" quote_convention_set_with_empty_conventions = QuoteConventionSet( [QuoteConvention("empty convention 1", []), QuoteConvention("empty convention 2", [])] ) - assert quote_convention_set_with_empty_conventions.opening_quotation_mark_regex.pattern == r"" - assert quote_convention_set_with_empty_conventions.closing_quotation_mark_regex.pattern == r"" - assert quote_convention_set_with_empty_conventions.all_quotation_mark_regex.pattern == r"" + assert quote_convention_set_with_empty_conventions._opening_quotation_mark_regex.pattern == r"" + assert quote_convention_set_with_empty_conventions._closing_quotation_mark_regex.pattern == r"" + assert quote_convention_set_with_empty_conventions._all_quotation_mark_regex.pattern == r"" standard_english_quote_convention_set = QuoteConventionSet( [ @@ -37,9 +37,9 @@ def test_quote_regexes() -> None: ) ] ) - assert standard_english_quote_convention_set.opening_quotation_mark_regex.pattern == r"[‘“]" - assert standard_english_quote_convention_set.closing_quotation_mark_regex.pattern == r"[’”]" - assert standard_english_quote_convention_set.all_quotation_mark_regex.pattern == r"[‘’“”]" + assert standard_english_quote_convention_set._opening_quotation_mark_regex.pattern == r"[‘“]" + assert standard_english_quote_convention_set._closing_quotation_mark_regex.pattern == r"[’”]" + assert standard_english_quote_convention_set._all_quotation_mark_regex.pattern == r"[‘’“”]" western_european_quote_convention_set = QuoteConventionSet( [ @@ -53,9 +53,9 @@ def test_quote_regexes() -> None: ), ] ) - assert western_european_quote_convention_set.opening_quotation_mark_regex.pattern == r"[«‘“]" - assert 
western_european_quote_convention_set.closing_quotation_mark_regex.pattern == r"[»’”]" - assert western_european_quote_convention_set.all_quotation_mark_regex.pattern == r"[«»‘’“”]" + assert western_european_quote_convention_set._opening_quotation_mark_regex.pattern == r"[«‘“]" + assert western_european_quote_convention_set._closing_quotation_mark_regex.pattern == r"[»’”]" + assert western_european_quote_convention_set._all_quotation_mark_regex.pattern == r"[«»‘’“”]" multiple_quote_convention_set = QuoteConventionSet( [ @@ -88,9 +88,9 @@ def test_quote_regexes() -> None: ), ] ) - assert multiple_quote_convention_set.opening_quotation_mark_regex.pattern == r"[<<<«‘“‹]" - assert multiple_quote_convention_set.closing_quotation_mark_regex.pattern == r"[>>>»’”›]" - assert multiple_quote_convention_set.all_quotation_mark_regex.pattern == r"[<<<>>>«»‘’“”‹›]" + assert multiple_quote_convention_set._opening_quotation_mark_regex.pattern == r"[<<<«‘“‹]" + assert multiple_quote_convention_set._closing_quotation_mark_regex.pattern == r"[>>>»’”›]" + assert multiple_quote_convention_set._all_quotation_mark_regex.pattern == r"[<<<>>>«»‘’“”‹›]" def test_quotation_mark_pair_map() -> None: @@ -546,34 +546,34 @@ def test_are_marks_a_valid_pair() -> None: ) standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) - assert standard_english_quote_convention_set.are_marks_a_valid_pair("“", "”") - assert not standard_english_quote_convention_set.are_marks_a_valid_pair("”", "“") - assert standard_english_quote_convention_set.are_marks_a_valid_pair("‘", "’") - assert not standard_english_quote_convention_set.are_marks_a_valid_pair("’", "‘") - assert not standard_english_quote_convention_set.are_marks_a_valid_pair("‘", "”") - assert not standard_english_quote_convention_set.are_marks_a_valid_pair("‘", "”") - assert not standard_english_quote_convention_set.are_marks_a_valid_pair("‘", "") - assert not 
standard_english_quote_convention_set.are_marks_a_valid_pair("", "") + assert standard_english_quote_convention_set.marks_are_a_valid_pair("“", "”") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("”", "“") + assert standard_english_quote_convention_set.marks_are_a_valid_pair("‘", "’") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("’", "‘") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("‘", "”") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("‘", "”") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("‘", "") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("", "") central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) - assert central_european_quote_convention_set.are_marks_a_valid_pair("„", "“") - assert central_european_quote_convention_set.are_marks_a_valid_pair("‚", "‘") - assert not central_european_quote_convention_set.are_marks_a_valid_pair("“", "„") - assert not central_european_quote_convention_set.are_marks_a_valid_pair("’", "‚") - assert not central_european_quote_convention_set.are_marks_a_valid_pair("‚", "“") - assert not central_european_quote_convention_set.are_marks_a_valid_pair("‚", "’") + assert central_european_quote_convention_set.marks_are_a_valid_pair("„", "“") + assert central_european_quote_convention_set.marks_are_a_valid_pair("‚", "‘") + assert not central_european_quote_convention_set.marks_are_a_valid_pair("“", "„") + assert not central_european_quote_convention_set.marks_are_a_valid_pair("’", "‚") + assert not central_european_quote_convention_set.marks_are_a_valid_pair("‚", "“") + assert not central_european_quote_convention_set.marks_are_a_valid_pair("‚", "’") standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) - assert standard_swedish_quote_convention_set.are_marks_a_valid_pair("”", "”") - 
assert standard_swedish_quote_convention_set.are_marks_a_valid_pair("’", "’") - assert not standard_swedish_quote_convention_set.are_marks_a_valid_pair("”", "’") - assert not standard_swedish_quote_convention_set.are_marks_a_valid_pair("’", "”") + assert standard_swedish_quote_convention_set.marks_are_a_valid_pair("”", "”") + assert standard_swedish_quote_convention_set.marks_are_a_valid_pair("’", "’") + assert not standard_swedish_quote_convention_set.marks_are_a_valid_pair("”", "’") + assert not standard_swedish_quote_convention_set.marks_are_a_valid_pair("’", "”") standard_french_quote_convention_set = QuoteConventionSet([standard_french_quote_convention]) - assert standard_french_quote_convention_set.are_marks_a_valid_pair("«", "»") - assert standard_french_quote_convention_set.are_marks_a_valid_pair("‹", "›") - assert not standard_french_quote_convention_set.are_marks_a_valid_pair("«", "›") - assert not standard_french_quote_convention_set.are_marks_a_valid_pair("‹", "»") + assert standard_french_quote_convention_set.marks_are_a_valid_pair("«", "»") + assert standard_french_quote_convention_set.marks_are_a_valid_pair("‹", "›") + assert not standard_french_quote_convention_set.marks_are_a_valid_pair("«", "›") + assert not standard_french_quote_convention_set.marks_are_a_valid_pair("‹", "»") multiple_quote_convention_set = QuoteConventionSet( [ @@ -583,18 +583,18 @@ def test_are_marks_a_valid_pair() -> None: standard_french_quote_convention, ] ) - assert multiple_quote_convention_set.are_marks_a_valid_pair("“", "”") - assert multiple_quote_convention_set.are_marks_a_valid_pair("‘", "’") - assert multiple_quote_convention_set.are_marks_a_valid_pair("„", "“") - assert multiple_quote_convention_set.are_marks_a_valid_pair("‚", "‘") - assert multiple_quote_convention_set.are_marks_a_valid_pair("”", "”") - assert multiple_quote_convention_set.are_marks_a_valid_pair("’", "’") - assert multiple_quote_convention_set.are_marks_a_valid_pair("«", "»") - assert 
multiple_quote_convention_set.are_marks_a_valid_pair("‹", "›") - assert not multiple_quote_convention_set.are_marks_a_valid_pair("‹", "»") - assert not multiple_quote_convention_set.are_marks_a_valid_pair("‹", "”") - assert not multiple_quote_convention_set.are_marks_a_valid_pair("„", "”") - assert not multiple_quote_convention_set.are_marks_a_valid_pair("’", "‘") + assert multiple_quote_convention_set.marks_are_a_valid_pair("“", "”") + assert multiple_quote_convention_set.marks_are_a_valid_pair("‘", "’") + assert multiple_quote_convention_set.marks_are_a_valid_pair("„", "“") + assert multiple_quote_convention_set.marks_are_a_valid_pair("‚", "‘") + assert multiple_quote_convention_set.marks_are_a_valid_pair("”", "”") + assert multiple_quote_convention_set.marks_are_a_valid_pair("’", "’") + assert multiple_quote_convention_set.marks_are_a_valid_pair("«", "»") + assert multiple_quote_convention_set.marks_are_a_valid_pair("‹", "›") + assert not multiple_quote_convention_set.marks_are_a_valid_pair("‹", "»") + assert not multiple_quote_convention_set.marks_are_a_valid_pair("‹", "”") + assert not multiple_quote_convention_set.marks_are_a_valid_pair("„", "”") + assert not multiple_quote_convention_set.marks_are_a_valid_pair("’", "‘") def test_is_quotation_mark_direction_ambiguous() -> None: @@ -809,54 +809,54 @@ def test_get_possible_depths() -> None: ) standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) - assert standard_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Opening) == {1, 3} - assert standard_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Closing) == set() - assert standard_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Closing) == {1, 3} - assert standard_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Opening) == set() - assert 
standard_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Opening) == {2, 4} - assert standard_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Closing) == set() - assert standard_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Closing) == {2, 4} - assert standard_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Opening) == set() - assert standard_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Opening) == set() - assert standard_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Closing) == set() - assert standard_english_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.Opening) == set() - assert standard_english_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.Closing) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == {1, 3} + assert standard_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.CLOSING) == {1, 3} + assert standard_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.OPENING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.OPENING) == {2, 4} + assert standard_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.CLOSING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.CLOSING) == {2, 4} + assert standard_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.OPENING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.OPENING) == set() + 
assert standard_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.CLOSING) == set() + assert standard_english_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.OPENING) == set() + assert standard_english_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.CLOSING) == set() british_english_quote_convention_set = QuoteConventionSet([british_english_quote_convention]) - assert british_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Opening) == {1, 3} - assert british_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Closing) == set() - assert british_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Closing) == {1, 3} - assert british_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Opening) == set() - assert british_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Opening) == {2, 4} - assert british_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Closing) == set() - assert british_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Closing) == {2, 4} - assert british_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Opening) == set() - assert british_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Opening) == set() - assert british_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Closing) == set() - assert british_english_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.Opening) == set() - assert british_english_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.Closing) == set() + assert british_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.OPENING) == {1, 3} + assert 
british_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.CLOSING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.CLOSING) == {1, 3} + assert british_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.OPENING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == {2, 4} + assert british_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.CLOSING) == {2, 4} + assert british_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.OPENING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.OPENING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.CLOSING) == set() + assert british_english_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.OPENING) == set() + assert british_english_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.CLOSING) == set() normalized_western_european_quote_convention_set = QuoteConventionSet( [normalized_western_european_quote_convention] ) assert normalized_western_european_quote_convention_set.get_possible_depths( - '"', QuotationMarkDirection.Opening + '"', QuotationMarkDirection.OPENING ) == {1, 2} assert normalized_western_european_quote_convention_set.get_possible_depths( - '"', QuotationMarkDirection.Closing + '"', QuotationMarkDirection.CLOSING ) == {1, 2} assert normalized_western_european_quote_convention_set.get_possible_depths( - "'", QuotationMarkDirection.Opening + "'", QuotationMarkDirection.OPENING ) == {3} assert normalized_western_european_quote_convention_set.get_possible_depths( - "'", QuotationMarkDirection.Closing + 
"'", QuotationMarkDirection.CLOSING ) == {3} assert ( - normalized_western_european_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Opening) + normalized_western_european_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == set() ) assert ( - normalized_western_european_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Closing) + normalized_western_european_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() ) @@ -867,20 +867,20 @@ def test_get_possible_depths() -> None: normalized_western_european_quote_convention, ] ) - assert multiple_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Opening) == {1, 2, 3, 4} - assert multiple_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.Closing) == set() - assert multiple_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Closing) == {1, 2, 3, 4} - assert multiple_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.Opening) == set() - assert multiple_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Opening) == {1, 2, 3, 4} - assert multiple_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.Closing) == set() - assert multiple_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Closing) == {1, 2, 3, 4} - assert multiple_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.Opening) == set() - assert multiple_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Opening) == set() - assert multiple_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.Closing) == set() - assert multiple_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.Opening) == {1, 2} - assert multiple_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.Closing) == {1, 2} - assert 
multiple_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.Opening) == {3} - assert multiple_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.Closing) == {3} + assert multiple_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.CLOSING) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.OPENING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.OPENING) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.CLOSING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.CLOSING) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.OPENING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.OPENING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.CLOSING) == set() + assert multiple_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.OPENING) == {1, 2} + assert multiple_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.CLOSING) == {1, 2} + assert multiple_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.OPENING) == {3} + assert multiple_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.CLOSING) == {3} def test_does_metadata_match_quotation_mark() -> None: @@ -895,125 +895,125 @@ def test_does_metadata_match_quotation_mark() -> None: ) standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) - assert 
standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201c", 1, QuotationMarkDirection.Opening + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 1, QuotationMarkDirection.OPENING ) - assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201c", 3, QuotationMarkDirection.Opening + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 3, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201c", 2, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 2, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201c", 4, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 4, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201c", 1, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 1, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201c", 2, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 2, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201c", 3, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 3, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201c", 4, QuotationMarkDirection.Closing + assert not 
standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 4, QuotationMarkDirection.CLOSING ) - assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201d", 1, QuotationMarkDirection.Closing + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 1, QuotationMarkDirection.CLOSING ) - assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201d", 3, QuotationMarkDirection.Closing + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 3, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201d", 2, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 2, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201d", 4, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 4, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201d", 1, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 1, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201d", 2, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 2, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201d", 3, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 3, QuotationMarkDirection.OPENING ) - assert not 
standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201d", 4, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 4, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2018", 1, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 1, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2018", 3, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 3, QuotationMarkDirection.OPENING ) - assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2018", 2, QuotationMarkDirection.Opening + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 2, QuotationMarkDirection.OPENING ) - assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2018", 4, QuotationMarkDirection.Opening + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 4, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2018", 1, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 1, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2018", 2, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 2, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2018", 3, QuotationMarkDirection.Closing + assert not 
standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 3, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2018", 4, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 4, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2019", 1, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 1, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2019", 3, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 3, QuotationMarkDirection.CLOSING ) - assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2019", 2, QuotationMarkDirection.Closing + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 2, QuotationMarkDirection.CLOSING ) - assert standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2019", 4, QuotationMarkDirection.Closing + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 4, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2019", 1, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 1, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2019", 2, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 2, QuotationMarkDirection.OPENING ) - assert not 
standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2019", 3, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 3, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u2019", 4, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 4, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201e", 1, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 1, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201e", 1, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 1, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201e", 2, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 2, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201e", 2, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 2, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201e", 3, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 3, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201e", 3, QuotationMarkDirection.Closing + assert not 
standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 3, QuotationMarkDirection.CLOSING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201e", 4, QuotationMarkDirection.Opening + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 4, QuotationMarkDirection.OPENING ) - assert not standard_english_quote_convention_set.does_metadata_match_quotation_mark( - "\u201e", 4, QuotationMarkDirection.Closing + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 4, QuotationMarkDirection.CLOSING ) @@ -1189,12 +1189,12 @@ def test_find_most_similar_convention() -> None: multiple_english_quotes_tabulator = QuotationMarkTabulator() multiple_english_quotes_tabulator.tabulate( [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, 
TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), ] ) assert all_three_quote_convention_set.find_most_similar_convention(multiple_english_quotes_tabulator) == ( @@ -1205,12 +1205,12 @@ def test_find_most_similar_convention() -> None: multiple_western_european_quotes_tabulator = QuotationMarkTabulator() multiple_western_european_quotes_tabulator.tabulate( [ - QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), - QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), - QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), - QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), - QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), ] ) assert all_three_quote_convention_set.find_most_similar_convention(multiple_western_european_quotes_tabulator) == ( @@ -1221,12 +1221,12 @@ def test_find_most_similar_convention() -> None: multiple_french_quotes_tabulator = 
QuotationMarkTabulator() multiple_french_quotes_tabulator.tabulate( [ - QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), - QuotationMarkMetadata("\u203a", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), - QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), - QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), - QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u203a", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), ] ) assert all_three_quote_convention_set.find_most_similar_convention(multiple_french_quotes_tabulator) == ( @@ -1241,12 +1241,12 @@ def test_find_most_similar_convention() -> None: noisy_multiple_english_quotes_tabulator = QuotationMarkTabulator() noisy_multiple_english_quotes_tabulator.tabulate( [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), - 
QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 28, 29), - QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), ] ) assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( @@ -1261,14 +1261,14 @@ def test_find_most_similar_convention() -> None: noisy_multiple_french_quotes_tabulator = QuotationMarkTabulator() noisy_multiple_french_quotes_tabulator.tabulate( [ - QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), - QuotationMarkMetadata("\u203a", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), - QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), - QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 13, 14), - QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 14, 15), - QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, 
TextSegment.Builder().build(), 28, 29), - QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.Closing, TextSegment.Builder().build(), 42, 43), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u203a", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), ] ) assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_french_quotes_tabulator) == ( @@ -1279,11 +1279,11 @@ def test_find_most_similar_convention() -> None: too_deep_english_quotes_tabulator = QuotationMarkTabulator() too_deep_english_quotes_tabulator.tabulate( [ - QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1), - QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 5, 6), - QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 13, 14), - QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 15, 16), - QuotationMarkMetadata("\u201c", 5, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 17, 18), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, 
QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 15, 16), + QuotationMarkMetadata("\u201c", 5, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 17, 18), ] ) assert all_three_quote_convention_set.find_most_similar_convention(too_deep_english_quotes_tabulator) == ( @@ -1294,7 +1294,7 @@ def test_find_most_similar_convention() -> None: # in case of ties, the earlier convention in the list should be returned unknown_quote_tabulator = QuotationMarkTabulator() unknown_quote_tabulator.tabulate( - [QuotationMarkMetadata("\u201a", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1)] + [QuotationMarkMetadata("\u201a", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1)] ) assert all_three_quote_convention_set.find_most_similar_convention(unknown_quote_tabulator) == ( standard_english_quote_convention, @@ -1303,7 +1303,7 @@ def test_find_most_similar_convention() -> None: single_french_opening_quote_tabulator = QuotationMarkTabulator() single_french_opening_quote_tabulator.tabulate( - [QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 1)] + [QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1)] ) assert all_three_quote_convention_set.find_most_similar_convention(single_french_opening_quote_tabulator) == ( standard_french_quote_convention, @@ -1317,7 +1317,7 @@ def test_find_most_similar_convention() -> None: # Default values should be returned when the QuoteConventionSet is empty single_english_opening_quote_tabulator = QuotationMarkTabulator() single_english_opening_quote_tabulator.tabulate( - [QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.Opening, TextSegment.Builder().build(), 0, 
1)] + [QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1)] ) empty_quote_convention_set = QuoteConventionSet([]) assert empty_quote_convention_set.find_most_similar_convention(single_english_opening_quote_tabulator) == ( diff --git a/tests/corpora/analysis/test_text_segment.py b/tests/corpora/analysis/test_text_segment.py index 9deaa60e..e5696e38 100644 --- a/tests/corpora/analysis/test_text_segment.py +++ b/tests/corpora/analysis/test_text_segment.py @@ -5,14 +5,14 @@ def test_builder_initialization() -> None: builder = TextSegment.Builder() - assert builder.text_segment.text == "" - assert builder.text_segment.previous_segment is None - assert builder.text_segment.next_segment is None - assert builder.text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker - assert builder.text_segment.markers_in_preceding_context == set() - assert builder.text_segment.index_in_verse == 0 - assert builder.text_segment.num_segments_in_verse == 0 - assert builder.text_segment.usfm_token is None + assert builder._text_segment._text == "" + assert builder._text_segment._previous_segment is None + assert builder._text_segment._next_segment is None + assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert builder._text_segment._markers_in_preceding_context == set() + assert builder._text_segment._index_in_verse == 0 + assert builder._text_segment._num_segments_in_verse == 0 + assert builder._text_segment._usfm_token is None def test_builder_set_text() -> None: @@ -20,7 +20,7 @@ def test_builder_set_text() -> None: text = "Example text" builder.set_text(text) - assert builder.text_segment.text == text + assert builder._text_segment._text == text def test_builder_set_previous_segment() -> None: @@ -28,43 +28,43 @@ def test_builder_set_previous_segment() -> None: previous_segment = TextSegment.Builder().set_text("previous segment text").build() 
builder.set_previous_segment(previous_segment) - assert builder.text_segment.previous_segment == previous_segment - assert builder.text_segment.next_segment is None - assert builder.text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker - assert builder.text_segment.markers_in_preceding_context == set() - assert builder.text_segment.index_in_verse == 0 - assert builder.text_segment.num_segments_in_verse == 0 + assert builder._text_segment._previous_segment == previous_segment + assert builder._text_segment._next_segment is None + assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert builder._text_segment._markers_in_preceding_context == set() + assert builder._text_segment._index_in_verse == 0 + assert builder._text_segment._num_segments_in_verse == 0 def test_builder_add_preceding_marker() -> None: builder = TextSegment.Builder() - builder.add_preceding_marker(UsfmMarkerType.ChapterMarker) - - assert builder.text_segment.immediate_preceding_marker is UsfmMarkerType.ChapterMarker - assert builder.text_segment.markers_in_preceding_context == {UsfmMarkerType.ChapterMarker} - assert builder.text_segment.previous_segment is None - assert builder.text_segment.next_segment is None - - builder.add_preceding_marker(UsfmMarkerType.VerseMarker) - assert builder.text_segment.immediate_preceding_marker == UsfmMarkerType.VerseMarker - assert builder.text_segment.markers_in_preceding_context == { - UsfmMarkerType.ChapterMarker, - UsfmMarkerType.VerseMarker, + builder.add_preceding_marker(UsfmMarkerType.CHAPTER) + + assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.CHAPTER + assert builder._text_segment._markers_in_preceding_context == {UsfmMarkerType.CHAPTER} + assert builder._text_segment._previous_segment is None + assert builder._text_segment._next_segment is None + + builder.add_preceding_marker(UsfmMarkerType.VERSE) + assert builder._text_segment._immediate_preceding_marker == UsfmMarkerType.VERSE 
+ assert builder._text_segment._markers_in_preceding_context == { + UsfmMarkerType.CHAPTER, + UsfmMarkerType.VERSE, } - assert builder.text_segment.previous_segment is None - assert builder.text_segment.next_segment is None + assert builder._text_segment._previous_segment is None + assert builder._text_segment._next_segment is None def test_builder_set_usfm_token() -> None: builder = TextSegment.Builder() builder.set_usfm_token(UsfmToken(type=UsfmTokenType.TEXT, text="USFM token text")) - assert builder.text_segment.usfm_token is not None - assert builder.text_segment.usfm_token.type == UsfmTokenType.TEXT - assert builder.text_segment.usfm_token.text == "USFM token text" - assert builder.text_segment.text == "" - assert builder.text_segment.previous_segment is None - assert builder.text_segment.next_segment is None + assert builder._text_segment._usfm_token is not None + assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT + assert builder._text_segment._usfm_token.text == "USFM token text" + assert builder._text_segment._text == "" + assert builder._text_segment._previous_segment is None + assert builder._text_segment._next_segment is None def test_set_previous_segment() -> None: @@ -72,12 +72,12 @@ def test_set_previous_segment() -> None: previous_segment = TextSegment.Builder().set_text("previous segment text").build() text_segment.set_previous_segment(previous_segment) - assert text_segment.previous_segment == previous_segment - assert text_segment.next_segment is None - assert text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker - assert text_segment.markers_in_preceding_context == set() - assert text_segment.index_in_verse == 0 - assert text_segment.num_segments_in_verse == 0 + assert text_segment._previous_segment == previous_segment + assert text_segment._next_segment is None + assert text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert text_segment._markers_in_preceding_context == set() + assert 
text_segment._index_in_verse == 0 + assert text_segment._num_segments_in_verse == 0 def test_set_next_segment() -> None: @@ -85,36 +85,36 @@ def test_set_next_segment() -> None: next_segment = TextSegment.Builder().set_text("next segment text").build() text_segment.set_next_segment(next_segment) - assert text_segment.previous_segment is None - assert text_segment.next_segment == next_segment - assert text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker - assert text_segment.markers_in_preceding_context == set() - assert text_segment.index_in_verse == 0 - assert text_segment.num_segments_in_verse == 0 + assert text_segment._previous_segment is None + assert text_segment._next_segment == next_segment + assert text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert text_segment._markers_in_preceding_context == set() + assert text_segment._index_in_verse == 0 + assert text_segment._num_segments_in_verse == 0 def test_set_index_in_verse() -> None: text_segment = TextSegment.Builder().set_text("example text").build() text_segment.set_index_in_verse(2) - assert text_segment.index_in_verse == 2 - assert text_segment.previous_segment is None - assert text_segment.next_segment is None - assert text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker - assert text_segment.markers_in_preceding_context == set() - assert text_segment.num_segments_in_verse == 0 + assert text_segment._index_in_verse == 2 + assert text_segment._previous_segment is None + assert text_segment._next_segment is None + assert text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert text_segment._markers_in_preceding_context == set() + assert text_segment._num_segments_in_verse == 0 def test_set_num_segments_in_verse() -> None: text_segment = TextSegment.Builder().set_text("example text").build() text_segment.set_num_segments_in_verse(5) - assert text_segment.num_segments_in_verse == 5 - assert text_segment.previous_segment is None - 
assert text_segment.next_segment is None - assert text_segment.immediate_preceding_marker is UsfmMarkerType.NoMarker - assert text_segment.markers_in_preceding_context == set() - assert text_segment.index_in_verse == 0 + assert text_segment._num_segments_in_verse == 5 + assert text_segment._previous_segment is None + assert text_segment._next_segment is None + assert text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert text_segment._markers_in_preceding_context == set() + assert text_segment._index_in_verse == 0 def test_equals() -> None: @@ -139,19 +139,19 @@ def test_equals() -> None: assert segment_with_index != basic_segment segment_with_preceding_marker = ( - TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.VerseMarker).build() + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.VERSE).build() ) segment_with_same_preceding_marker = ( - TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.VerseMarker).build() + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.VERSE).build() ) segment_with_different_preceding_marker = ( - TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.ChapterMarker).build() + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.CHAPTER).build() ) segment_with_multiple_preceding_markers = ( TextSegment.Builder() .set_text("text1") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build() ) @@ -198,18 +198,18 @@ def test_equals() -> None: def test_get_text() -> None: text_segment = TextSegment.Builder().set_text("example text").build() - assert text_segment.get_text() == "example text" + assert text_segment.text == "example text" text_segment = TextSegment.Builder().set_text("new example text").build() - assert 
text_segment.get_text() == "new example text" + assert text_segment.text == "new example text" def test_length() -> None: text_segment = TextSegment.Builder().set_text("example text").build() - assert text_segment.length() == len("example text") + assert text_segment.length == len("example text") text_segment = TextSegment.Builder().set_text("new example text").build() - assert text_segment.length() == len("new example text") + assert text_segment.length == len("new example text") def test_substring_before() -> None: @@ -231,40 +231,40 @@ def test_substring_after() -> None: def test_is_marker_in_preceding_context() -> None: no_preceding_marker_segment = TextSegment.Builder().set_text("example text").build() - assert no_preceding_marker_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker) is False - assert no_preceding_marker_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) is False - assert no_preceding_marker_segment.is_marker_in_preceding_context(UsfmMarkerType.CharacterMarker) is False + assert no_preceding_marker_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER) is False + assert no_preceding_marker_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) is False + assert no_preceding_marker_segment.marker_is_in_preceding_context(UsfmMarkerType.CHARACTER) is False one_preceding_marker_text_segment = ( - TextSegment.Builder().set_text("example text").add_preceding_marker(UsfmMarkerType.CharacterMarker).build() + TextSegment.Builder().set_text("example text").add_preceding_marker(UsfmMarkerType.CHARACTER).build() ) - assert one_preceding_marker_text_segment.is_marker_in_preceding_context(UsfmMarkerType.CharacterMarker) is True - assert one_preceding_marker_text_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) is False - assert one_preceding_marker_text_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker) is False + assert 
one_preceding_marker_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHARACTER) is True + assert one_preceding_marker_text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) is False + assert one_preceding_marker_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER) is False two_preceding_markers_text_segment = ( TextSegment.Builder() .set_text("example text") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build() ) - assert two_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker) is True - assert two_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) is True - assert two_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.CharacterMarker) is False + assert two_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER) is True + assert two_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) is True + assert two_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHARACTER) is False three_preceding_markers_text_segment = ( TextSegment.Builder() .set_text("example text") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) .build() ) - assert three_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.ChapterMarker) is True - assert three_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.VerseMarker) is True - assert 
three_preceding_markers_text_segment.is_marker_in_preceding_context(UsfmMarkerType.CharacterMarker) is True + assert three_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER) is True + assert three_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) is True + assert three_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHARACTER) is True def test_is_first_segment_in_verse() -> None: @@ -293,28 +293,28 @@ def test_is_last_segment_in_verse() -> None: def test_replace_substring() -> None: text_segment = TextSegment.Builder().set_text("example text").build() text_segment.replace_substring(0, 7, "sample") - assert text_segment.get_text() == "sample text" + assert text_segment.text == "sample text" text_segment.replace_substring(7, 11, "text") - assert text_segment.get_text() == "sample text" + assert text_segment.text == "sample text" text_segment.replace_substring(0, 7, "") - assert text_segment.get_text() == "text" + assert text_segment.text == "text" text_segment.replace_substring(0, 4, "new'") - assert text_segment.get_text() == "new'" + assert text_segment.text == "new'" text_segment.replace_substring(3, 4, "\u2019") - assert text_segment.get_text() == "new\u2019" + assert text_segment.text == "new\u2019" text_segment.replace_substring(0, 0, "prefix ") - assert text_segment.get_text() == "prefix new\u2019" + assert text_segment.text == "prefix new\u2019" text_segment.replace_substring(0, 0, "") - assert text_segment.get_text() == "prefix new\u2019" + assert text_segment.text == "prefix new\u2019" text_segment.replace_substring(11, 11, " suffix") - assert text_segment.get_text() == "prefix new\u2019 suffix" + assert text_segment.text == "prefix new\u2019 suffix" text_segment.replace_substring(6, 6, "-") - assert text_segment.get_text() == "prefix- new\u2019 suffix" + assert text_segment.text == "prefix- new\u2019 suffix" diff --git 
a/tests/corpora/analysis/test_usfm_structure_extractor.py b/tests/corpora/analysis/test_usfm_structure_extractor.py index edaef383..31d26e27 100644 --- a/tests/corpora/analysis/test_usfm_structure_extractor.py +++ b/tests/corpora/analysis/test_usfm_structure_extractor.py @@ -20,8 +20,8 @@ def test_chapter_and_verse_markers(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build() ] ) @@ -31,8 +31,8 @@ def test_chapter_and_verse_markers(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None def test_start_paragraph_marker(): @@ -49,9 +49,9 @@ def test_start_paragraph_marker(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.ParagraphMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) .build() ] ) @@ -61,8 +61,8 @@ def test_start_paragraph_marker(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert 
actual_chapters[0].verses[0]._text_segments[0].next_segment is None def test_start_character_marker(): @@ -79,9 +79,9 @@ def test_start_character_marker(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) .build() ] ) @@ -91,8 +91,8 @@ def test_start_character_marker(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None def test_end_character_marker(): @@ -109,9 +109,9 @@ def test_end_character_marker(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) .build() ] ) @@ -121,8 +121,8 @@ def test_end_character_marker(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None def test_end_note_marker(): @@ -139,9 +139,9 @@ 
def test_end_note_marker(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.EmbedMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.EMBED) .build() ] ) @@ -151,8 +151,8 @@ def test_end_note_marker(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None def test_end_table_marker(): @@ -169,9 +169,9 @@ def test_end_table_marker(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.EmbedMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.EMBED) .build() ] ) @@ -181,8 +181,8 @@ def test_end_table_marker(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None def test_ref_marker(): @@ -199,9 +199,9 @@ def test_ref_marker(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - 
.add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.EmbedMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.EMBED) .build() ] ) @@ -211,8 +211,8 @@ def test_ref_marker(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None def test_sidebar_marker(): @@ -229,9 +229,9 @@ def test_sidebar_marker(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.EmbedMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.EMBED) .build() ] ) @@ -241,8 +241,8 @@ def test_sidebar_marker(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None def test_multiple_verses(): @@ -260,8 +260,8 @@ def test_multiple_verses(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build() ] ), @@ -269,8 
+269,8 @@ def test_multiple_verses(): [ TextSegment.Builder() .set_text("test2") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build() ] ), @@ -280,10 +280,10 @@ def test_multiple_verses(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None - assert actual_chapters[0].verses[1].text_segments[0].get_previous_segment() is None - assert actual_chapters[0].verses[1].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + assert actual_chapters[0].verses[1]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[1]._text_segments[0].next_segment is None def test_multiple_chapters(): @@ -302,8 +302,8 @@ def test_multiple_chapters(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build() ] ), @@ -315,8 +315,8 @@ def test_multiple_chapters(): [ TextSegment.Builder() .set_text("test2") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build() ] ), @@ -326,10 +326,10 @@ def test_multiple_chapters(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) - assert actual_chapters[0].verses[0].text_segments[0].get_previous_segment() 
is None - assert actual_chapters[0].verses[0].text_segments[0].get_next_segment() is None - assert actual_chapters[1].verses[0].text_segments[0].get_previous_segment() is None - assert actual_chapters[1].verses[0].text_segments[0].get_next_segment() is None + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + assert actual_chapters[1].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[1].verses[0]._text_segments[0].next_segment is None def test_character_marker_in_text(): @@ -347,14 +347,14 @@ def test_character_marker_in_text(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build(), TextSegment.Builder() .set_text("test2") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) .build(), ] ), @@ -365,12 +365,11 @@ def test_character_marker_in_text(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) assert ( - actual_chapters[0].verses[0].text_segments[1].get_previous_segment() - == expected_chapters[0].verses[0].text_segments[0] + actual_chapters[0].verses[0]._text_segments[1].previous_segment + == expected_chapters[0].verses[0]._text_segments[0] ) assert ( - actual_chapters[0].verses[0].text_segments[0].get_next_segment() - == expected_chapters[0].verses[0].text_segments[1] + actual_chapters[0].verses[0]._text_segments[0].next_segment == expected_chapters[0].verses[0]._text_segments[1] ) @@ -391,14 +390,14 @@ def 
test_empty_text(): [ TextSegment.Builder() .set_text("test") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) .build(), TextSegment.Builder() .set_text("test2") - .add_preceding_marker(UsfmMarkerType.ChapterMarker) - .add_preceding_marker(UsfmMarkerType.VerseMarker) - .add_preceding_marker(UsfmMarkerType.CharacterMarker) + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) .build(), ] ), @@ -409,12 +408,11 @@ def test_empty_text(): actual_chapters = usfm_structure_extractor.get_chapters() assert_chapter_equal(expected_chapters, actual_chapters) assert ( - actual_chapters[0].verses[0].text_segments[1].get_previous_segment() - == expected_chapters[0].verses[0].text_segments[0] + actual_chapters[0].verses[0]._text_segments[1].previous_segment + == expected_chapters[0].verses[0]._text_segments[0] ) assert ( - actual_chapters[0].verses[0].text_segments[0].get_next_segment() - == expected_chapters[0].verses[0].text_segments[1] + actual_chapters[0].verses[0]._text_segments[0].next_segment == expected_chapters[0].verses[0]._text_segments[1] ) @@ -436,6 +434,6 @@ def assert_chapter_equal(expected_chapters: List[Chapter], actual_chapters: List for expected_chapter, actual_chapter in zip(expected_chapters, actual_chapters): assert len(expected_chapter.verses) == len(actual_chapter.verses) for expected_verse, actual_verse in zip(expected_chapter.verses, actual_chapter.verses): - assert len(expected_verse.text_segments) == len(actual_verse.text_segments) - for expected_segment, actual_segment in zip(expected_verse.text_segments, actual_verse.text_segments): + assert len(expected_verse._text_segments) == len(actual_verse._text_segments) + for expected_segment, actual_segment in zip(expected_verse._text_segments, actual_verse._text_segments): 
assert expected_segment == actual_segment diff --git a/tests/corpora/analysis/test_verse.py b/tests/corpora/analysis/test_verse.py index aa8e3fb4..3d6de831 100644 --- a/tests/corpora/analysis/test_verse.py +++ b/tests/corpora/analysis/test_verse.py @@ -10,8 +10,8 @@ def test_initialize_verse() -> None: verse = Verse(text_segments) - assert len(verse.get_text_segments()) == 3 - assert verse.get_text_segments() == text_segments + assert len(verse.text_segments) == 3 + assert verse.text_segments == text_segments def test_segment_indices() -> None: @@ -23,9 +23,9 @@ def test_segment_indices() -> None: verse = Verse(text_segments) - assert verse.get_text_segments()[0].index_in_verse == 0 - assert verse.get_text_segments()[1].index_in_verse == 1 - assert verse.get_text_segments()[2].index_in_verse == 2 + assert verse.text_segments[0]._index_in_verse == 0 + assert verse.text_segments[1]._index_in_verse == 1 + assert verse.text_segments[2]._index_in_verse == 2 def test_num_segments_in_verse() -> None: @@ -37,6 +37,6 @@ def test_num_segments_in_verse() -> None: verse = Verse(text_segments) - assert verse.get_text_segments()[0].num_segments_in_verse == 3 - assert verse.get_text_segments()[1].num_segments_in_verse == 3 - assert verse.get_text_segments()[2].num_segments_in_verse == 3 + assert verse.text_segments[0]._num_segments_in_verse == 3 + assert verse.text_segments[1]._num_segments_in_verse == 3 + assert verse.text_segments[2]._num_segments_in_verse == 3 diff --git a/tests/corpora/test_fallback_quotation_mark_resolver.py b/tests/corpora/test_fallback_quotation_mark_resolver.py index 16ca3fac..241b2222 100644 --- a/tests/corpora/test_fallback_quotation_mark_resolver.py +++ b/tests/corpora/test_fallback_quotation_mark_resolver.py @@ -22,7 +22,7 @@ def test_reset(): ) basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 + '"', 1, 
QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 ) basic_quotation_mark_resolver._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) @@ -51,10 +51,10 @@ def test_simple_quotation_mark_resolution(): ) expected_resolved_quotation_marks = [ QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"test text"').build(), 0, 1 + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"test text"').build(), 0, 1 ), QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Closing, TextSegment.Builder().set_text('"test text"').build(), 10, 11 + '"', 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().set_text('"test text"').build(), 10, 11 ), ] @@ -150,7 +150,7 @@ def test_is_opening_quote_stateful(): # immediately preceding quote basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 ) assert basic_quotation_mark_resolver._is_opening_quote(quote_match) is True @@ -171,26 +171,26 @@ def test_does_most_recent_opening_mark_immediately_precede(): # correct preceding quote basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 ) assert basic_quotation_mark_resolver._does_most_recent_opening_mark_immediately_precede(nested_quote_match) is True # wrong direction for preceding quote basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Closing, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 + '"', 1, QuotationMarkDirection.CLOSING, 
TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 ) assert basic_quotation_mark_resolver._does_most_recent_opening_mark_immediately_precede(nested_quote_match) is False # different text segment for preceding quote basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"\'different text"').build(), 0, 1 + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"\'different text"').build(), 0, 1 ) assert basic_quotation_mark_resolver._does_most_recent_opening_mark_immediately_precede(nested_quote_match) is False # previous quote is not *immediately* before the current quote nested_quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('" \'test text"').build(), 2, 3) basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('" \'test text"').build(), 0, 1 + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('" \'test text"').build(), 0, 1 ) assert basic_quotation_mark_resolver._does_most_recent_opening_mark_immediately_precede(nested_quote_match) is False @@ -276,7 +276,7 @@ def test_resolve_opening_quote(): ) expected_resolved_quotation_mark = QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Opening, TextSegment.Builder().set_text('"test text"').build(), 0, 1 + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"test text"').build(), 0, 1 ) actual_resolved_quotation_mark = basic_quotation_mark_resolver._resolve_opening_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) @@ -296,7 +296,7 @@ def test_resolve_closing_quote(): ) expected_resolved_quotation_mark = QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.Closing, TextSegment.Builder().set_text('"test text"').build(), 10, 11 + '"', 1, QuotationMarkDirection.CLOSING, 
TextSegment.Builder().set_text('"test text"').build(), 10, 11 ) actual_resolved_quotation_mark = basic_quotation_mark_resolver._resolve_closing_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 10, 11) @@ -310,5 +310,4 @@ def assert_resolved_quotation_marks_equal( ) -> None: assert len(actual_resolved_quotation_marks) == len(expected_resolved_quotation_marks) for actual_mark, expected_mark in zip(actual_resolved_quotation_marks, expected_resolved_quotation_marks): - print(f"Actual: {actual_mark.get_quotation_mark()}, Expected: {expected_mark.get_quotation_mark()}") assert actual_mark == expected_mark diff --git a/tests/corpora/test_quotation_mark_update_first_pass.py b/tests/corpora/test_quotation_mark_update_first_pass.py index 3ccc937c..af38368a 100644 --- a/tests/corpora/test_quotation_mark_update_first_pass.py +++ b/tests/corpora/test_quotation_mark_update_first_pass.py @@ -334,24 +334,24 @@ def test_choose_best_action_based_on_observed_issues() -> None: first_pass_analyzer._will_fallback_mode_work = False # Test with no issues - best_action = first_pass_analyzer._choose_best_action_based_on_observed_issues([]) + best_action = first_pass_analyzer._choose_best_strategy_based_on_observed_issues([]) assert best_action == QuotationMarkUpdateStrategy.APPLY_FULL # Test with one issue assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK] ) == QuotationMarkUpdateStrategy.SKIP ) assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK] ) == QuotationMarkUpdateStrategy.SKIP ) assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( 
[QuotationMarkResolutionIssue.TOO_DEEP_NESTING] ) == QuotationMarkUpdateStrategy.SKIP @@ -359,7 +359,7 @@ def test_choose_best_action_based_on_observed_issues() -> None: # Test with multiple issues assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [ QuotationMarkResolutionIssue.TOO_DEEP_NESTING, QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, @@ -368,7 +368,7 @@ def test_choose_best_action_based_on_observed_issues() -> None: == QuotationMarkUpdateStrategy.SKIP ) assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [ QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, @@ -377,7 +377,7 @@ def test_choose_best_action_based_on_observed_issues() -> None: == QuotationMarkUpdateStrategy.SKIP ) assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [ QuotationMarkResolutionIssue.TOO_DEEP_NESTING, QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, @@ -392,24 +392,24 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No first_pass_analyzer._will_fallback_mode_work = True # Test with no issues - best_action = first_pass_analyzer._choose_best_action_based_on_observed_issues([]) + best_action = first_pass_analyzer._choose_best_strategy_based_on_observed_issues([]) assert best_action == QuotationMarkUpdateStrategy.APPLY_FULL # Test with one issue assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK] ) == QuotationMarkUpdateStrategy.APPLY_FALLBACK ) assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + 
first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK] ) == QuotationMarkUpdateStrategy.SKIP ) assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [QuotationMarkResolutionIssue.TOO_DEEP_NESTING] ) == QuotationMarkUpdateStrategy.APPLY_FALLBACK @@ -417,7 +417,7 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No # Test with multiple issues assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [ QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, @@ -426,7 +426,7 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No == QuotationMarkUpdateStrategy.SKIP ) assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [ QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, QuotationMarkResolutionIssue.TOO_DEEP_NESTING, @@ -435,7 +435,7 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No == QuotationMarkUpdateStrategy.SKIP ) assert ( - first_pass_analyzer._choose_best_action_based_on_observed_issues( + first_pass_analyzer._choose_best_strategy_based_on_observed_issues( [ QuotationMarkResolutionIssue.TOO_DEEP_NESTING, QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, @@ -644,7 +644,7 @@ def run_first_pass( first_pass_analyzer = QuotationMarkUpdateFirstPass(source_quote_convention, target_quote_convention) parse_usfm(normalized_usfm, first_pass_analyzer) - return first_pass_analyzer.get_best_actions_by_chapter() + return first_pass_analyzer.find_best_chapter_strategies() def run_first_pass_on_chapter( @@ -664,7 +664,7 @@ def run_first_pass_on_chapter( chapter = 
Chapter([Verse([TextSegment.Builder().set_text(verse_text).build() for verse_text in verse_texts])]) - return first_pass_analyzer._find_best_action_for_chapter(chapter) + return first_pass_analyzer._find_best_strategy_for_chapter(chapter) def get_quote_convention_by_name(name: str) -> QuoteConvention: diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index baab9180..ff67f957 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -1,6 +1,7 @@ -from typing import Generator, List, Union +from typing import Generator, List, Set, Union from machine.corpora import ( + QuotationMarkUpdateResolutionSettings, QuotationMarkUpdateSettings, QuotationMarkUpdateStrategy, QuoteConventionChangingUsfmUpdateBlockHandler, @@ -18,9 +19,9 @@ QuotationMarkFinder, QuotationMarkMetadata, QuotationMarkResolutionIssue, - QuotationMarkResolutionSettings, QuotationMarkResolver, QuotationMarkStringMatch, + QuoteConvention, QuoteConventionSet, TextSegment, UsfmMarkerType, @@ -439,9 +440,10 @@ def test_process_scripture_element() -> None: assert quote_convention_changer._quotation_mark_finder.num_times_called == 1 assert mock_quotation_mark_resolver.num_times_called == 1 - assert quote_convention_changer._quotation_mark_finder.matches_to_return[0].text_segment.text == "this is a ‘test" + assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment._text == "this is a ‘test" assert ( - quote_convention_changer._quotation_mark_finder.matches_to_return[1].text_segment.text == "the test ends” here" + quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment._text + == "the test ends” here" ) @@ -456,11 +458,11 @@ def test_create_text_segments_basic() -> None: text_segments: List[TextSegment] = 
quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert text_segments[0].text == "test segment" - assert text_segments[0].immediate_preceding_marker is UsfmMarkerType.NoMarker - assert text_segments[0].markers_in_preceding_context == set() - assert text_segments[0].previous_segment is None - assert text_segments[0].next_segment is None + assert text_segments[0]._text == "test segment" + assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert text_segments[0]._markers_in_preceding_context == set() + assert text_segments[0]._previous_segment is None + assert text_segments[0]._next_segment is None def test_create_text_segments_with_preceding_markers() -> None: @@ -479,14 +481,14 @@ def test_create_text_segments_with_preceding_markers() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert text_segments[0].text == "test segment" - assert text_segments[0].immediate_preceding_marker == UsfmMarkerType.ParagraphMarker - assert text_segments[0].markers_in_preceding_context == { - UsfmMarkerType.VerseMarker, - UsfmMarkerType.ParagraphMarker, + assert text_segments[0]._text == "test segment" + assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH + assert text_segments[0]._markers_in_preceding_context == { + UsfmMarkerType.VERSE, + UsfmMarkerType.PARAGRAPH, } - assert text_segments[0].previous_segment is None - assert text_segments[0].next_segment is None + assert text_segments[0]._previous_segment is None + assert text_segments[0]._next_segment is None def test_create_text_segments_with_multiple_text_tokens() -> None: @@ -509,16 +511,22 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 2 - assert text_segments[0].text == "test 
segment1" - assert text_segments[0].immediate_preceding_marker == UsfmMarkerType.ParagraphMarker - assert text_segments[0].markers_in_preceding_context == {UsfmMarkerType.VerseMarker, UsfmMarkerType.ParagraphMarker} - assert text_segments[0].previous_segment is None - assert text_segments[0].next_segment == text_segments[1] - assert text_segments[1].text == "test segment2" - assert text_segments[1].immediate_preceding_marker == UsfmMarkerType.CharacterMarker - assert text_segments[1].markers_in_preceding_context == {UsfmMarkerType.VerseMarker, UsfmMarkerType.CharacterMarker} - assert text_segments[1].previous_segment == text_segments[0] - assert text_segments[1].next_segment is None + assert text_segments[0]._text == "test segment1" + assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH + assert text_segments[0]._markers_in_preceding_context == { + UsfmMarkerType.VERSE, + UsfmMarkerType.PARAGRAPH, + } + assert text_segments[0]._previous_segment is None + assert text_segments[0]._next_segment == text_segments[1] + assert text_segments[1]._text == "test segment2" + assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER + assert text_segments[1]._markers_in_preceding_context == { + UsfmMarkerType.VERSE, + UsfmMarkerType.CHARACTER, + } + assert text_segments[1]._previous_segment == text_segments[0] + assert text_segments[1]._next_segment is None def test_create_text_segment() -> None: @@ -530,10 +538,10 @@ def test_create_text_segment() -> None: segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token) assert segment is not None - assert segment.text == "test segment" - assert segment.immediate_preceding_marker is UsfmMarkerType.NoMarker - assert segment.markers_in_preceding_context == set() - assert segment.usfm_token == usfm_token + assert segment._text == "test segment" + assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert 
segment._markers_in_preceding_context == set() + assert segment._usfm_token == usfm_token def test_set_previous_and_next_for_segments() -> None: @@ -549,12 +557,12 @@ def test_set_previous_and_next_for_segments() -> None: quote_convention_changer._set_previous_and_next_for_segments(segments) - assert segments[0].previous_segment is None - assert segments[0].next_segment == segments[1] - assert segments[1].previous_segment == segments[0] - assert segments[1].next_segment == segments[2] - assert segments[2].previous_segment == segments[1] - assert segments[2].next_segment is None + assert segments[0]._previous_segment is None + assert segments[0]._next_segment == segments[1] + assert segments[1]._previous_segment == segments[0] + assert segments[1]._next_segment == segments[2] + assert segments[2]._previous_segment == segments[1] + assert segments[2]._next_segment is None def test_check_for_chapter_change() -> None: @@ -588,9 +596,9 @@ def test_start_new_chapter() -> None: ) ) - quote_convention_changer._next_scripture_text_segment_builder.add_preceding_marker( - UsfmMarkerType.EmbedMarker - ).set_text("this text should be erased") + quote_convention_changer._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED).set_text( + "this text should be erased" + ) quote_convention_changer._verse_text_quotation_mark_resolver._issues.add( QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK ) @@ -598,9 +606,9 @@ def test_start_new_chapter() -> None: quote_convention_changer._start_new_chapter(1) segment = quote_convention_changer._next_scripture_text_segment_builder.build() assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP - assert segment.immediate_preceding_marker == UsfmMarkerType.ChapterMarker - assert segment.text == "" - assert UsfmMarkerType.EmbedMarker not in segment.markers_in_preceding_context + assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER + assert segment._text == "" + assert 
UsfmMarkerType.EMBED not in segment._markers_in_preceding_context assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set() quote_convention_changer._start_new_chapter(2) @@ -673,7 +681,7 @@ def find_all_potential_quotation_marks_in_text_segments( class MockQuotationMarkResolver(QuotationMarkResolver): def __init__(self): - super().__init__(QuotationMarkResolutionSettings()) + super().__init__(QuotationMarkUpdateResolutionSettings(QuoteConvention("", []), QuoteConvention("", []))) self.num_times_called = 0 def resolve_quotation_marks( @@ -681,12 +689,15 @@ def resolve_quotation_marks( ) -> Generator[QuotationMarkMetadata, None, None]: self.num_times_called += 1 current_depth = 1 - current_direction = QuotationMarkDirection.Opening + current_direction = QuotationMarkDirection.OPENING for quote_match in quote_matches: yield quote_match.resolve(current_depth, current_direction) current_depth += 1 current_direction = ( - QuotationMarkDirection.Closing - if current_direction == QuotationMarkDirection.Opening - else QuotationMarkDirection.Opening + QuotationMarkDirection.CLOSING + if current_direction == QuotationMarkDirection.OPENING + else QuotationMarkDirection.OPENING ) + + def get_issues(self) -> Set[QuotationMarkResolutionIssue]: + return set() From 247fa4cc733d1a84f1ac2cc8ae99d11828d4eb2d Mon Sep 17 00:00:00 2001 From: Ben King Date: Wed, 2 Jul 2025 10:51:34 -0400 Subject: [PATCH 20/31] Correct TextSegment equality function --- machine/corpora/analysis/text_segment.py | 2 +- tests/corpora/analysis/test_text_segment.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/machine/corpora/analysis/text_segment.py b/machine/corpora/analysis/text_segment.py index 039bd579..491489fe 100644 --- a/machine/corpora/analysis/text_segment.py +++ b/machine/corpora/analysis/text_segment.py @@ -22,7 +22,7 @@ def __eq__(self, value): return False if self._index_in_verse != value._index_in_verse: return False - if self._index_in_verse 
!= value._index_in_verse: + if self._num_segments_in_verse != value._num_segments_in_verse: return False if self._usfm_token != value._usfm_token: return False diff --git a/tests/corpora/analysis/test_text_segment.py b/tests/corpora/analysis/test_text_segment.py index e5696e38..08444faf 100644 --- a/tests/corpora/analysis/test_text_segment.py +++ b/tests/corpora/analysis/test_text_segment.py @@ -178,8 +178,8 @@ def test_equals() -> None: segment_with_different_num_verses.set_num_segments_in_verse(4) assert segment_with_num_verses == segment_with_same_num_verses - assert segment_with_num_verses == segment_with_different_num_verses - assert segment_with_num_verses == basic_segment + assert segment_with_num_verses != segment_with_different_num_verses + assert segment_with_num_verses != basic_segment assert segment_with_preceding_marker == segment_with_same_preceding_marker assert segment_with_preceding_marker != segment_with_different_preceding_marker From 5463f29eace751daca6eceb99f83bf047c81886a Mon Sep 17 00:00:00 2001 From: Ben King Date: Wed, 9 Jul 2025 14:13:11 -0400 Subject: [PATCH 21/31] Damien's requested code-review changes --- .../fallback_quotation_mark_resolver.py | 12 +-- .../__init__.py | 0 .../chapter.py | 2 +- .../depth_based_quotation_mark_resolver.py | 8 +- .../preliminary_quotation_analyzer.py | 6 +- .../quotation_mark_direction.py | 0 .../quotation_mark_finder.py | 4 +- .../quotation_mark_metadata.py | 0 .../quotation_mark_resolution_issue.py | 0 .../quotation_mark_resolution_settings.py | 7 +- .../quotation_mark_resolver.py | 0 .../quotation_mark_string_match.py | 28 +++---- .../quotation_mark_tabulator.py | 8 +- .../quote_convention.py | 35 ++++---- ...onvention_detection_resolution_settings.py | 7 +- .../quote_convention_detector.py | 10 +-- .../quote_convention_set.py | 12 +-- .../standard_quote_conventions.py | 2 +- .../text_segment.py | 0 .../usfm_marker_type.py | 0 .../usfm_structure_extractor.py | 0 .../verse.py | 0 
.../quotation_denormalization_first_pass.py | 2 +- ...normalization_usfm_update_block_handler.py | 2 +- .../quotation_mark_update_first_pass.py | 22 ++--- ...otation_mark_update_resolution_settings.py | 17 ++-- ...tion_changing_usfm_update_block_handler.py | 16 ++-- .../test_chapter.py | 2 +- ...est_depth_based_quotation_mark_resolver.py | 82 +++++++++---------- .../test_preliminary_quotation_analyzer.py | 2 +- .../test_quotation_mark_finder.py | 14 ++-- .../test_quotation_mark_metadata.py | 4 +- .../test_quotation_mark_resolver.py | 4 +- .../test_quotation_mark_string_match.py | 2 +- .../test_quotation_mark_tabulator.py | 2 +- .../test_quote_convention.py | 30 +++---- .../test_quote_convention_detector.py | 46 +++++------ .../test_quote_convention_set.py | 2 +- .../test_text_segment.py | 2 +- .../test_usfm_structure_extractor.py | 2 +- .../test_verse.py | 2 +- .../test_fallback_quotation_mark_resolver.py | 22 ++--- ...normalization_usfm_block_update_handler.py | 4 +- .../test_quotation_mark_update_first_pass.py | 12 +-- ...tion_changing_usfm_block_update_handler.py | 6 +- 45 files changed, 223 insertions(+), 217 deletions(-) rename machine/corpora/{analysis => punctuation_analysis}/__init__.py (100%) rename machine/corpora/{analysis => punctuation_analysis}/chapter.py (80%) rename machine/corpora/{analysis => punctuation_analysis}/depth_based_quotation_mark_resolver.py (98%) rename machine/corpora/{analysis => punctuation_analysis}/preliminary_quotation_analyzer.py (98%) rename machine/corpora/{analysis => punctuation_analysis}/quotation_mark_direction.py (100%) rename machine/corpora/{analysis => punctuation_analysis}/quotation_mark_finder.py (92%) rename machine/corpora/{analysis => punctuation_analysis}/quotation_mark_metadata.py (100%) rename machine/corpora/{analysis => punctuation_analysis}/quotation_mark_resolution_issue.py (100%) rename machine/corpora/{analysis => punctuation_analysis}/quotation_mark_resolution_settings.py (85%) rename 
machine/corpora/{analysis => punctuation_analysis}/quotation_mark_resolver.py (100%) rename machine/corpora/{analysis => punctuation_analysis}/quotation_mark_string_match.py (83%) rename machine/corpora/{analysis => punctuation_analysis}/quotation_mark_tabulator.py (96%) rename machine/corpora/{analysis => punctuation_analysis}/quote_convention.py (84%) rename machine/corpora/{analysis => punctuation_analysis}/quote_convention_detection_resolution_settings.py (91%) rename machine/corpora/{analysis => punctuation_analysis}/quote_convention_detector.py (89%) rename machine/corpora/{analysis => punctuation_analysis}/quote_convention_set.py (93%) rename machine/corpora/{analysis => punctuation_analysis}/standard_quote_conventions.py (99%) rename machine/corpora/{analysis => punctuation_analysis}/text_segment.py (100%) rename machine/corpora/{analysis => punctuation_analysis}/usfm_marker_type.py (100%) rename machine/corpora/{analysis => punctuation_analysis}/usfm_structure_extractor.py (100%) rename machine/corpora/{analysis => punctuation_analysis}/verse.py (100%) rename tests/corpora/{analysis => punctuation_analysis}/test_chapter.py (90%) rename tests/corpora/{analysis => punctuation_analysis}/test_depth_based_quotation_mark_resolver.py (97%) rename tests/corpora/{analysis => punctuation_analysis}/test_preliminary_quotation_analyzer.py (99%) rename tests/corpora/{analysis => punctuation_analysis}/test_quotation_mark_finder.py (96%) rename tests/corpora/{analysis => punctuation_analysis}/test_quotation_mark_metadata.py (94%) rename tests/corpora/{analysis => punctuation_analysis}/test_quotation_mark_resolver.py (96%) rename tests/corpora/{analysis => punctuation_analysis}/test_quotation_mark_string_match.py (99%) rename tests/corpora/{analysis => punctuation_analysis}/test_quotation_mark_tabulator.py (98%) rename tests/corpora/{analysis => punctuation_analysis}/test_quote_convention.py (94%) rename tests/corpora/{analysis => 
punctuation_analysis}/test_quote_convention_detector.py (82%) rename tests/corpora/{analysis => punctuation_analysis}/test_quote_convention_set.py (99%) rename tests/corpora/{analysis => punctuation_analysis}/test_text_segment.py (99%) rename tests/corpora/{analysis => punctuation_analysis}/test_usfm_structure_extractor.py (99%) rename tests/corpora/{analysis => punctuation_analysis}/test_verse.py (95%) diff --git a/machine/corpora/fallback_quotation_mark_resolver.py b/machine/corpora/fallback_quotation_mark_resolver.py index 002fd4ea..9f4694c0 100644 --- a/machine/corpora/fallback_quotation_mark_resolver.py +++ b/machine/corpora/fallback_quotation_mark_resolver.py @@ -1,11 +1,11 @@ from typing import Generator, Set, Union -from .analysis.quotation_mark_direction import QuotationMarkDirection -from .analysis.quotation_mark_metadata import QuotationMarkMetadata -from .analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue -from .analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings -from .analysis.quotation_mark_resolver import QuotationMarkResolver -from .analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .punctuation_analysis.quotation_mark_direction import QuotationMarkDirection +from .punctuation_analysis.quotation_mark_metadata import QuotationMarkMetadata +from .punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .punctuation_analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver +from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch class FallbackQuotationMarkResolver(QuotationMarkResolver): diff --git a/machine/corpora/analysis/__init__.py b/machine/corpora/punctuation_analysis/__init__.py similarity index 100% rename from machine/corpora/analysis/__init__.py rename to 
machine/corpora/punctuation_analysis/__init__.py diff --git a/machine/corpora/analysis/chapter.py b/machine/corpora/punctuation_analysis/chapter.py similarity index 80% rename from machine/corpora/analysis/chapter.py rename to machine/corpora/punctuation_analysis/chapter.py index 30d01add..342a91de 100644 --- a/machine/corpora/analysis/chapter.py +++ b/machine/corpora/punctuation_analysis/chapter.py @@ -3,6 +3,6 @@ from .verse import Verse -@dataclass +@dataclass(frozen=True) class Chapter: verses: list[Verse] diff --git a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py similarity index 98% rename from machine/corpora/analysis/depth_based_quotation_mark_resolver.py rename to machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py index 004212b1..bff404b3 100644 --- a/machine/corpora/analysis/depth_based_quotation_mark_resolver.py +++ b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py @@ -101,7 +101,7 @@ def add_quotation_continuer( class QuotationMarkCategorizer: - apostrophe_pattern = regex.compile(r"[\'\u2019\u2018]", regex.U) + _APOSTROPHE_PATTERN = regex.compile(r"[\'\u2019\u2018]", regex.U) def __init__( self, @@ -188,7 +188,7 @@ def _meets_quote_continuer_prerequisites( next_match: Union[QuotationMarkStringMatch, None], ) -> bool: if ( - self._settings.should_rely_on_paragraph_markers() + self._settings.should_rely_on_paragraph_markers and not quote_match._text_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH) ): return False @@ -228,7 +228,7 @@ def is_closing_quote( match.has_trailing_whitespace() or match.has_trailing_punctuation() or match.is_at_end_of_segment() - or match.next_character_matches(self._settings.get_closing_quotation_mark_regex()) + or match.next_character_matches(self._settings.closing_quotation_mark_regex) ) and not match.has_leading_whitespace() return True @@ -293,7 +293,7 @@ def 
is_apostrophe( match: QuotationMarkStringMatch, next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if not match.quotation_mark_matches(self.apostrophe_pattern): + if not match.quotation_mark_matches(self._APOSTROPHE_PATTERN): return False # Latin letters on both sides of punctuation mark diff --git a/machine/corpora/analysis/preliminary_quotation_analyzer.py b/machine/corpora/punctuation_analysis/preliminary_quotation_analyzer.py similarity index 98% rename from machine/corpora/analysis/preliminary_quotation_analyzer.py rename to machine/corpora/punctuation_analysis/preliminary_quotation_analyzer.py index c3a43a94..ab1b5ce2 100644 --- a/machine/corpora/analysis/preliminary_quotation_analyzer.py +++ b/machine/corpora/punctuation_analysis/preliminary_quotation_analyzer.py @@ -193,7 +193,7 @@ def has_distinct_paired_quotation_mark(self, quotation_mark: str) -> bool: class PreliminaryApostropheAnalyzer: - apostrophe_pattern = regex.compile(r"[\'\u2019]", regex.U) + _APOSTROPHE_PATTERN = regex.compile(r"[\'\u2019]", regex.U) def __init__(self): self._apostrophe_proportion_statistics = ApostropheProportionStatistics() @@ -213,7 +213,7 @@ def process_quotation_marks( self._process_quotation_mark(quotation_mark_match) def _process_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> None: - if quotation_mark_match.quotation_mark_matches(self.apostrophe_pattern): + if quotation_mark_match.quotation_mark_matches(self._APOSTROPHE_PATTERN): self._count_apostrophe(quotation_mark_match) def _count_apostrophe(self, apostrophe_match: QuotationMarkStringMatch) -> None: @@ -248,7 +248,7 @@ def _is_match_word_final(self, apostrophe_match: QuotationMarkStringMatch) -> bo return True def is_apostrophe_only(self, mark: str) -> bool: - if not self.apostrophe_pattern.search(mark): + if not self._APOSTROPHE_PATTERN.search(mark): return False if self._word_position_statistics.is_mark_rarely_initial( diff --git 
a/machine/corpora/analysis/quotation_mark_direction.py b/machine/corpora/punctuation_analysis/quotation_mark_direction.py similarity index 100% rename from machine/corpora/analysis/quotation_mark_direction.py rename to machine/corpora/punctuation_analysis/quotation_mark_direction.py diff --git a/machine/corpora/analysis/quotation_mark_finder.py b/machine/corpora/punctuation_analysis/quotation_mark_finder.py similarity index 92% rename from machine/corpora/analysis/quotation_mark_finder.py rename to machine/corpora/punctuation_analysis/quotation_mark_finder.py index 30a60cf1..16d00d34 100644 --- a/machine/corpora/analysis/quotation_mark_finder.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_finder.py @@ -10,7 +10,7 @@ class QuotationMarkFinder: - quote_pattern = regex.compile(r"(\p{Quotation_Mark}|<<|>>|<|>)", regex.U) + _QUOTE_PATTERN = regex.compile(r"(\p{Quotation_Mark}|<<|>>|<|>)", regex.U) def __init__(self, quote_convention_set: QuoteConventionSet): self._quote_convention_set = quote_convention_set @@ -36,7 +36,7 @@ def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment ) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for quote_match in self.quote_pattern.finditer(text_segment.text): + for quote_match in self._QUOTE_PATTERN.finditer(text_segment.text): if self._quote_convention_set.is_valid_opening_quotation_mark( quote_match.group() ) or self._quote_convention_set.is_valid_closing_quotation_mark(quote_match.group()): diff --git a/machine/corpora/analysis/quotation_mark_metadata.py b/machine/corpora/punctuation_analysis/quotation_mark_metadata.py similarity index 100% rename from machine/corpora/analysis/quotation_mark_metadata.py rename to machine/corpora/punctuation_analysis/quotation_mark_metadata.py diff --git a/machine/corpora/analysis/quotation_mark_resolution_issue.py b/machine/corpora/punctuation_analysis/quotation_mark_resolution_issue.py similarity index 100% 
rename from machine/corpora/analysis/quotation_mark_resolution_issue.py rename to machine/corpora/punctuation_analysis/quotation_mark_resolution_issue.py diff --git a/machine/corpora/analysis/quotation_mark_resolution_settings.py b/machine/corpora/punctuation_analysis/quotation_mark_resolution_settings.py similarity index 85% rename from machine/corpora/analysis/quotation_mark_resolution_settings.py rename to machine/corpora/punctuation_analysis/quotation_mark_resolution_settings.py index aed6711e..f636ae6d 100644 --- a/machine/corpora/analysis/quotation_mark_resolution_settings.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_resolution_settings.py @@ -15,15 +15,18 @@ def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStr @abstractmethod def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: ... + @property @abstractmethod - def get_opening_quotation_mark_regex(self) -> regex.Pattern: ... + def opening_quotation_mark_regex(self) -> regex.Pattern: ... + @property @abstractmethod - def get_closing_quotation_mark_regex(self) -> regex.Pattern: ... + def closing_quotation_mark_regex(self) -> regex.Pattern: ... @abstractmethod def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: ... + @property @abstractmethod def should_rely_on_paragraph_markers(self) -> bool: ... 
diff --git a/machine/corpora/analysis/quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/quotation_mark_resolver.py similarity index 100% rename from machine/corpora/analysis/quotation_mark_resolver.py rename to machine/corpora/punctuation_analysis/quotation_mark_resolver.py diff --git a/machine/corpora/analysis/quotation_mark_string_match.py b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py similarity index 83% rename from machine/corpora/analysis/quotation_mark_string_match.py rename to machine/corpora/punctuation_analysis/quotation_mark_string_match.py index 736b376a..c2e47aaf 100644 --- a/machine/corpora/analysis/quotation_mark_string_match.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py @@ -13,11 +13,11 @@ class QuotationMarkStringMatch: # extra stuff in the regex to handle Western Cham - letter_pattern: Pattern = regex.compile(r"[\p{L}\U0001E200-\U0001E28F]", regex.U) - latin_letter_pattern: Pattern = regex.compile(r"^\p{script=Latin}$", regex.U) - whitespace_pattern: Pattern = regex.compile(r"[\s~]", regex.U) - punctuation_pattern: Pattern = regex.compile(r"[\.,;\?!\)\]\-—۔،؛]", regex.U) - quote_introducer_pattern: Pattern = regex.compile(r"[:,]\s*$", regex.U) + _LETTER_PATTERN: Pattern = regex.compile(r"[\p{L}\U0001E200-\U0001E28F]", regex.U) + _LATIN_LETTER_PATTERN: Pattern = regex.compile(r"^\p{script=Latin}$", regex.U) + _WHITESPACE_PATTERN: Pattern = regex.compile(r"[\s~]", regex.U) + _PUNCTUATION_PATTERN: Pattern = regex.compile(r"[\.,;\?!\)\]\-—۔،؛]", regex.U) + _QUOTE_INTRODUCER_PATTERN: Pattern = regex.compile(r"[:,]\s*$", regex.U) def __init__(self, text_segment: TextSegment, start_index: int, end_index: int): self._text_segment = text_segment @@ -123,28 +123,28 @@ def has_leading_whitespace(self) -> bool: or self._text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) ) - return self.previous_character_matches(self.whitespace_pattern) + return 
self.previous_character_matches(self._WHITESPACE_PATTERN) def has_trailing_whitespace(self) -> bool: - return self.next_character_matches(self.whitespace_pattern) + return self.next_character_matches(self._WHITESPACE_PATTERN) def has_leading_punctuation(self) -> bool: - return self.previous_character_matches(self.punctuation_pattern) + return self.previous_character_matches(self._PUNCTUATION_PATTERN) def has_trailing_punctuation(self) -> bool: - return self.next_character_matches(self.punctuation_pattern) + return self.next_character_matches(self._PUNCTUATION_PATTERN) def has_letter_in_leading_substring(self) -> bool: - return self.leading_substring_matches(self.letter_pattern) + return self.leading_substring_matches(self._LETTER_PATTERN) def has_letter_in_trailing_substring(self) -> bool: - return self.trailing_substring_matches(self.letter_pattern) + return self.trailing_substring_matches(self._LETTER_PATTERN) def has_leading_latin_letter(self) -> bool: - return self.previous_character_matches(self.latin_letter_pattern) + return self.previous_character_matches(self._LATIN_LETTER_PATTERN) def has_trailing_latin_letter(self) -> bool: - return self.next_character_matches(self.latin_letter_pattern) + return self.next_character_matches(self._LATIN_LETTER_PATTERN) def has_quote_introducer_in_leading_substring(self) -> bool: - return self.leading_substring_matches(self.quote_introducer_pattern) + return self.leading_substring_matches(self._QUOTE_INTRODUCER_PATTERN) diff --git a/machine/corpora/analysis/quotation_mark_tabulator.py b/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py similarity index 96% rename from machine/corpora/analysis/quotation_mark_tabulator.py rename to machine/corpora/punctuation_analysis/quotation_mark_tabulator.py index 522d145e..fd6935df 100644 --- a/machine/corpora/analysis/quotation_mark_tabulator.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing 
import Dict, List from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_metadata import QuotationMarkMetadata @@ -73,7 +73,8 @@ def calculate_similarity(self, quote_convention: QuoteConvention) -> float: return 0 return 1 - (num_differences / num_total_quotation_marks) - def print_summary(self) -> None: + def get_summary_message(self) -> str: + message_lines: List[str] = [] for depth in range(1, 5): if self._depth_and_direction_observed( depth, QuotationMarkDirection.OPENING @@ -88,7 +89,7 @@ def print_summary(self) -> None: depth, QuotationMarkDirection.CLOSING ) ) - print( + message_lines.append( "The most common level %i quotes are %s (%i of %i opening quotes) and %s (%i of %i closing quotes)" % ( depth, @@ -100,3 +101,4 @@ def print_summary(self) -> None: total_closing_count, ) ) + return "\n".join(message_lines) diff --git a/machine/corpora/analysis/quote_convention.py b/machine/corpora/punctuation_analysis/quote_convention.py similarity index 84% rename from machine/corpora/analysis/quote_convention.py rename to machine/corpora/punctuation_analysis/quote_convention.py index 609c786b..3dc3a9e0 100644 --- a/machine/corpora/analysis/quote_convention.py +++ b/machine/corpora/punctuation_analysis/quote_convention.py @@ -3,7 +3,7 @@ from .quotation_mark_direction import QuotationMarkDirection -quote_normalization_map: Dict[str, str] = { +_QUOTATION_MARK_NORMALIZATION_MAP: Dict[str, str] = { "\u00ab": '"', "\u00bb": '"', "\u2018": "'", @@ -19,20 +19,20 @@ } -@dataclass +@dataclass(frozen=True) class SingleLevelQuoteConvention: opening_quote: str closing_quote: str def normalize(self) -> "SingleLevelQuoteConvention": normalized_opening_quote = ( - quote_normalization_map[self.opening_quote] - if self.opening_quote in quote_normalization_map + _QUOTATION_MARK_NORMALIZATION_MAP[self.opening_quote] + if self.opening_quote in _QUOTATION_MARK_NORMALIZATION_MAP else self.opening_quote ) normalized_closing_quote = ( - 
quote_normalization_map[self.closing_quote] - if self.closing_quote in quote_normalization_map + _QUOTATION_MARK_NORMALIZATION_MAP[self.closing_quote] + if self.closing_quote in _QUOTATION_MARK_NORMALIZATION_MAP else self.closing_quote ) return SingleLevelQuoteConvention(normalized_opening_quote, normalized_closing_quote) @@ -40,13 +40,13 @@ def normalize(self) -> "SingleLevelQuoteConvention": class QuoteConvention: def __init__(self, name: str, levels: list[SingleLevelQuoteConvention]): - self.name = name + self._name = name self.levels = levels def __eq__(self, value): if not isinstance(value, QuoteConvention): return False - if self.name != value.name: + if self._name != value._name: return False if len(self.levels) != len(value.levels): return False @@ -57,10 +57,12 @@ def __eq__(self, value): return False return True - def get_name(self) -> str: - return self.name + @property + def name(self) -> str: + return self._name - def get_num_levels(self) -> int: + @property + def num_levels(self) -> int: return len(self.levels) def get_opening_quote_at_level(self, level: int) -> str: @@ -70,7 +72,7 @@ def get_closing_quote_at_level(self, level: int) -> str: return self.levels[level - 1].closing_quote def get_expected_quotation_mark(self, depth: int, direction: QuotationMarkDirection) -> str: - if depth > len(self.levels) or depth < 1: + if depth > self.num_levels or depth < 1: return "" return ( self.get_opening_quote_at_level(depth) @@ -117,13 +119,10 @@ def is_compatible_with_observed_quotation_marks( return True def normalize(self) -> "QuoteConvention": - return QuoteConvention(self.get_name() + "_normalized", [level.normalize() for level in self.levels]) + return QuoteConvention(self.name + "_normalized", [level.normalize() for level in self.levels]) - def print_summary(self) -> None: - print(self._get_summary_message()) - - def _get_summary_message(self) -> str: - summary = self.get_name() + "\n" + def __str__(self) -> str: + summary = self.name + "\n" for level, 
convention in enumerate(self.levels): ordinal_name = self._get_ordinal_name(level + 1) summary += "%s%s-level quote%s\n" % ( diff --git a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py b/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py similarity index 91% rename from machine/corpora/analysis/quote_convention_detection_resolution_settings.py rename to machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py index c167fbb9..97591194 100644 --- a/machine/corpora/analysis/quote_convention_detection_resolution_settings.py +++ b/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py @@ -19,15 +19,18 @@ def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStr def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_set) - def get_opening_quotation_mark_regex(self) -> regex.Pattern: + @property + def opening_quotation_mark_regex(self) -> regex.Pattern: return self._quote_convention_set.opening_quotation_mark_regex - def get_closing_quotation_mark_regex(self) -> regex.Pattern: + @property + def closing_quotation_mark_regex(self) -> regex.Pattern: return self._quote_convention_set.closing_quotation_mark_regex def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: return self._quote_convention_set.marks_are_a_valid_pair(opening_mark, closing_mark) + @property def should_rely_on_paragraph_markers(self): return True diff --git a/machine/corpora/analysis/quote_convention_detector.py b/machine/corpora/punctuation_analysis/quote_convention_detector.py similarity index 89% rename from machine/corpora/analysis/quote_convention_detector.py rename to machine/corpora/punctuation_analysis/quote_convention_detector.py index 3e2673b9..4b915365 100644 --- 
a/machine/corpora/analysis/quote_convention_detector.py +++ b/machine/corpora/punctuation_analysis/quote_convention_detector.py @@ -11,11 +11,11 @@ from .quote_convention import QuoteConvention from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings from .quote_convention_set import QuoteConventionSet -from .standard_quote_conventions import standard_quote_conventions +from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS from .usfm_structure_extractor import UsfmStructureExtractor -@dataclass +@dataclass(frozen=True) class QuoteConventionAnalysis: best_quote_convention: QuoteConvention best_quote_convention_score: float @@ -29,7 +29,7 @@ def __init__(self): def _count_quotation_marks_in_chapters(self, chapters: list[Chapter]) -> None: possible_quote_conventions: QuoteConventionSet = PreliminaryQuotationAnalyzer( - standard_quote_conventions + STANDARD_QUOTE_CONVENTIONS ).narrow_down_possible_quote_conventions(chapters) for chapter in chapters: @@ -53,12 +53,12 @@ def _count_quotation_marks_in_chapter( def detect_quotation_convention(self, print_summary: bool) -> Union[QuoteConventionAnalysis, None]: self._count_quotation_marks_in_chapters(self.get_chapters()) - (best_quote_convention, score) = standard_quote_conventions.find_most_similar_convention( + (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention( self._quotation_mark_tabulator ) if print_summary: - self._quotation_mark_tabulator.print_summary() + print(self._quotation_mark_tabulator.get_summary_message()) if score > 0 and best_quote_convention is not None: return QuoteConventionAnalysis(best_quote_convention, score) diff --git a/machine/corpora/analysis/quote_convention_set.py b/machine/corpora/punctuation_analysis/quote_convention_set.py similarity index 93% rename from machine/corpora/analysis/quote_convention_set.py rename to machine/corpora/punctuation_analysis/quote_convention_set.py index 
5d27c25f..01128dc6 100644 --- a/machine/corpora/analysis/quote_convention_set.py +++ b/machine/corpora/punctuation_analysis/quote_convention_set.py @@ -26,7 +26,7 @@ def _create_quote_regexes(self) -> None: if len(self._conventions) > 0: for convention in self._conventions: - for level in range(1, convention.get_num_levels() + 1): + for level in range(1, convention.num_levels + 1): opening_quote = convention.get_opening_quote_at_level(level) closing_quote = convention.get_closing_quote_at_level(level) opening_quotation_marks.add(opening_quote) @@ -56,7 +56,7 @@ def _create_quotation_mark_pair_map(self) -> None: self.closing_marks_by_opening_mark: Dict[str, set[str]] = dict() self.opening_marks_by_closing_mark: Dict[str, set[str]] = dict() for convention in self._conventions: - for level in range(1, convention.get_num_levels() + 1): + for level in range(1, convention.num_levels + 1): opening_quote = convention.get_opening_quote_at_level(level) closing_quote = convention.get_closing_quote_at_level(level) if opening_quote not in self.closing_marks_by_opening_mark: @@ -80,12 +80,12 @@ def quotation_mark_regex(self) -> Pattern: def get_quote_convention_by_name(self, name: str) -> Union[QuoteConvention, None]: for convention in self._conventions: - if convention.get_name() == name: + if convention.name == name: return convention return None def get_all_quote_convention_names(self) -> List[str]: - return sorted([qc.name for qc in self._conventions]) + return sorted([qc._name for qc in self._conventions]) def get_possible_opening_marks(self) -> list[str]: return sorted(list(self.closing_marks_by_opening_mark.keys())) @@ -157,7 +157,3 @@ def find_most_similar_convention( best_quote_convention = quote_convention return (best_quote_convention, best_similarity) - - def print_summary(self) -> None: - print("Opening quotation marks must be one of the following: ", self.get_possible_opening_marks()) - print("Closing quotation marks must be one of the following: ", 
self.get_possible_closing_marks()) diff --git a/machine/corpora/analysis/standard_quote_conventions.py b/machine/corpora/punctuation_analysis/standard_quote_conventions.py similarity index 99% rename from machine/corpora/analysis/standard_quote_conventions.py rename to machine/corpora/punctuation_analysis/standard_quote_conventions.py index d3c72b90..b1292e15 100644 --- a/machine/corpora/analysis/standard_quote_conventions.py +++ b/machine/corpora/punctuation_analysis/standard_quote_conventions.py @@ -1,7 +1,7 @@ from .quote_convention import QuoteConvention, SingleLevelQuoteConvention from .quote_convention_set import QuoteConventionSet -standard_quote_conventions: QuoteConventionSet = QuoteConventionSet( +STANDARD_QUOTE_CONVENTIONS: QuoteConventionSet = QuoteConventionSet( [ QuoteConvention( "standard_english", diff --git a/machine/corpora/analysis/text_segment.py b/machine/corpora/punctuation_analysis/text_segment.py similarity index 100% rename from machine/corpora/analysis/text_segment.py rename to machine/corpora/punctuation_analysis/text_segment.py diff --git a/machine/corpora/analysis/usfm_marker_type.py b/machine/corpora/punctuation_analysis/usfm_marker_type.py similarity index 100% rename from machine/corpora/analysis/usfm_marker_type.py rename to machine/corpora/punctuation_analysis/usfm_marker_type.py diff --git a/machine/corpora/analysis/usfm_structure_extractor.py b/machine/corpora/punctuation_analysis/usfm_structure_extractor.py similarity index 100% rename from machine/corpora/analysis/usfm_structure_extractor.py rename to machine/corpora/punctuation_analysis/usfm_structure_extractor.py diff --git a/machine/corpora/analysis/verse.py b/machine/corpora/punctuation_analysis/verse.py similarity index 100% rename from machine/corpora/analysis/verse.py rename to machine/corpora/punctuation_analysis/verse.py diff --git a/machine/corpora/quotation_denormalization_first_pass.py b/machine/corpora/quotation_denormalization_first_pass.py index 
d3cc453c..e3e98db4 100644 --- a/machine/corpora/quotation_denormalization_first_pass.py +++ b/machine/corpora/quotation_denormalization_first_pass.py @@ -1,4 +1,4 @@ -from .analysis.quote_convention import QuoteConvention +from .punctuation_analysis.quote_convention import QuoteConvention from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass diff --git a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py index d6cd9cef..e92fa1d1 100644 --- a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py +++ b/machine/corpora/quotation_denormalization_usfm_update_block_handler.py @@ -1,4 +1,4 @@ -from .analysis.quote_convention import QuoteConvention +from .punctuation_analysis.quote_convention import QuoteConvention from .quotation_mark_update_settings import QuotationMarkUpdateSettings from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler diff --git a/machine/corpora/quotation_mark_update_first_pass.py b/machine/corpora/quotation_mark_update_first_pass.py index f4853968..414ef1c3 100644 --- a/machine/corpora/quotation_mark_update_first_pass.py +++ b/machine/corpora/quotation_mark_update_first_pass.py @@ -1,14 +1,14 @@ from typing import Dict, List, Set -from .analysis.chapter import Chapter -from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver -from .analysis.quotation_mark_finder import QuotationMarkFinder -from .analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue -from .analysis.quotation_mark_resolver import QuotationMarkResolver -from .analysis.quotation_mark_string_match import QuotationMarkStringMatch -from .analysis.quote_convention import QuoteConvention -from .analysis.quote_convention_set import QuoteConventionSet -from .analysis.usfm_structure_extractor import UsfmStructureExtractor +from 
.punctuation_analysis.chapter import Chapter +from .punctuation_analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .punctuation_analysis.quotation_mark_finder import QuotationMarkFinder +from .punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver +from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .punctuation_analysis.quote_convention import QuoteConvention +from .punctuation_analysis.quote_convention_set import QuoteConventionSet +from .punctuation_analysis.usfm_structure_extractor import UsfmStructureExtractor from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy @@ -34,11 +34,11 @@ def _check_whether_fallback_mode_will_work( self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention ) -> bool: target_marks_by_source_marks: Dict[str, Set[str]] = {} - for level in range(1, source_quote_convention.get_num_levels() + 1): + for level in range(1, source_quote_convention.num_levels + 1): opening_quotation_mark = source_quote_convention.get_opening_quote_at_level(level) if opening_quotation_mark not in target_marks_by_source_marks: target_marks_by_source_marks[opening_quotation_mark] = set() - if level <= target_quote_convention.get_num_levels(): + if level <= target_quote_convention.num_levels: target_marks_by_source_marks[opening_quotation_mark].add( target_quote_convention.get_closing_quote_at_level(level) ) diff --git a/machine/corpora/quotation_mark_update_resolution_settings.py b/machine/corpora/quotation_mark_update_resolution_settings.py index b0fd40f2..0628e07c 100644 --- a/machine/corpora/quotation_mark_update_resolution_settings.py +++ b/machine/corpora/quotation_mark_update_resolution_settings.py @@ -2,11 +2,11 @@ import 
regex -from .analysis.quotation_mark_direction import QuotationMarkDirection -from .analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings -from .analysis.quotation_mark_string_match import QuotationMarkStringMatch -from .analysis.quote_convention import QuoteConvention -from .analysis.quote_convention_set import QuoteConventionSet +from .punctuation_analysis.quotation_mark_direction import QuotationMarkDirection +from .punctuation_analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .punctuation_analysis.quote_convention import QuoteConvention +from .punctuation_analysis.quote_convention_set import QuoteConventionSet class QuotationMarkUpdateResolutionSettings(QuotationMarkResolutionSettings): @@ -21,15 +21,18 @@ def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStr def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_singleton_set) - def get_opening_quotation_mark_regex(self) -> regex.Pattern: + @property + def opening_quotation_mark_regex(self) -> regex.Pattern: return self._quote_convention_singleton_set.opening_quotation_mark_regex - def get_closing_quotation_mark_regex(self) -> regex.Pattern: + @property + def closing_quotation_mark_regex(self) -> regex.Pattern: return self._quote_convention_singleton_set.closing_quotation_mark_regex def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: return self._quote_convention_singleton_set.marks_are_a_valid_pair(opening_mark, closing_mark) + @property def should_rely_on_paragraph_markers(self): return False diff --git a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py index 745b718b..86aa72ec 
100644 --- a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py +++ b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py @@ -1,14 +1,14 @@ from typing import List, Union -from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver -from .analysis.quotation_mark_finder import QuotationMarkFinder -from .analysis.quotation_mark_resolver import QuotationMarkResolver -from .analysis.quotation_mark_string_match import QuotationMarkStringMatch -from .analysis.quote_convention import QuoteConvention -from .analysis.quote_convention_set import QuoteConventionSet -from .analysis.text_segment import TextSegment -from .analysis.usfm_marker_type import UsfmMarkerType from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver +from .punctuation_analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .punctuation_analysis.quotation_mark_finder import QuotationMarkFinder +from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver +from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .punctuation_analysis.quote_convention import QuoteConvention +from .punctuation_analysis.quote_convention_set import QuoteConventionSet +from .punctuation_analysis.text_segment import TextSegment +from .punctuation_analysis.usfm_marker_type import UsfmMarkerType from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings from .quotation_mark_update_settings import QuotationMarkUpdateSettings from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy diff --git a/tests/corpora/analysis/test_chapter.py b/tests/corpora/punctuation_analysis/test_chapter.py similarity index 90% rename from tests/corpora/analysis/test_chapter.py rename to tests/corpora/punctuation_analysis/test_chapter.py index cb9f4f23..a8ee6cce 100644 --- a/tests/corpora/analysis/test_chapter.py +++ 
b/tests/corpora/punctuation_analysis/test_chapter.py @@ -1,4 +1,4 @@ -from machine.corpora.analysis import Chapter, TextSegment, Verse +from machine.corpora.punctuation_analysis import Chapter, TextSegment, Verse def test_initialize_verse() -> None: diff --git a/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py b/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py similarity index 97% rename from tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py rename to tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py index a41933d1..904cd6c8 100644 --- a/tests/corpora/analysis/test_depth_based_quotation_mark_resolver.py +++ b/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py @@ -1,7 +1,7 @@ from pytest import raises from machine.corpora import QuotationMarkUpdateResolutionSettings -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( DepthBasedQuotationMarkResolver, QuotationContinuerState, QuotationContinuerStyle, @@ -316,7 +316,7 @@ def test_add_quotation_continuer() -> None: def test_is_english_quotation_continuer() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None @@ -564,7 +564,7 @@ def test_is_english_quotation_continuer() -> None: def test_is_spanish_quotation_continuer() -> None: western_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("western_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") ) assert western_european_quote_convention is not None @@ -812,7 +812,7 @@ def test_is_spanish_quotation_continuer() -> None: def 
test_is_opening_quote() -> None: central_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") ) assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -825,7 +825,7 @@ def test_is_opening_quote() -> None: ) british_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("british_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") ) assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -836,7 +836,7 @@ def test_is_opening_quote() -> None: ) standard_swedish_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") ) assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1039,7 +1039,7 @@ def test_is_opening_quote() -> None: def test_is_closing_quote() -> None: central_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") ) assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1052,7 +1052,7 @@ def test_is_closing_quote() -> None: ) british_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("british_english") + 
standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") ) assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1063,7 +1063,7 @@ def test_is_closing_quote() -> None: ) standard_swedish_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") ) assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1074,7 +1074,7 @@ def test_is_closing_quote() -> None: ) standard_french_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_french") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_french") ) assert standard_french_quote_convention is not None standard_french_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1223,7 +1223,7 @@ def test_is_closing_quote() -> None: def test_is_malformed_opening_quote() -> None: central_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") ) assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1236,7 +1236,7 @@ def test_is_malformed_opening_quote() -> None: ) british_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("british_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") ) assert british_english_quote_convention is not None 
british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1247,7 +1247,7 @@ def test_is_malformed_opening_quote() -> None: ) standard_swedish_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") ) assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1424,7 +1424,7 @@ def test_is_malformed_opening_quote() -> None: def test_is_malformed_closing_quote() -> None: central_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") ) assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1437,7 +1437,7 @@ def test_is_malformed_closing_quote() -> None: ) british_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("british_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") ) assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1448,7 +1448,7 @@ def test_is_malformed_closing_quote() -> None: ) standard_swedish_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") ) assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1622,7 +1622,7 @@ def test_is_malformed_closing_quote() 
-> None: def test_is_unpaired_closing_quote() -> None: central_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") ) assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1635,7 +1635,7 @@ def test_is_unpaired_closing_quote() -> None: ) british_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("british_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") ) assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1646,7 +1646,7 @@ def test_is_unpaired_closing_quote() -> None: ) standard_swedish_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") ) assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1809,7 +1809,7 @@ def test_is_unpaired_closing_quote() -> None: def test_is_apostrophe() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1822,7 +1822,7 @@ def test_is_apostrophe() -> None: ) typewriter_english_quote_convention = ( - 
standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") ) assert typewriter_english_quote_convention is not None typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -1974,7 +1974,7 @@ def test_is_apostrophe() -> None: # DepthBasedQuotationMarkResolver tests def test_depth_based_quotation_mark_resolver_reset() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2006,7 +2006,7 @@ def test_depth_based_quotation_mark_resolver_reset() -> None: def test_basic_quotation_mark_recognition() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2035,7 +2035,7 @@ def test_basic_quotation_mark_recognition() -> None: def test_resolution_only_of_passed_matches() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2075,7 +2075,7 @@ def test_resolution_only_of_passed_matches() -> 
None: def test_resolution_across_segments() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2105,7 +2105,7 @@ def test_resolution_across_segments() -> None: def test_resolution_with_apostrophes() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2138,7 +2138,7 @@ def test_resolution_with_apostrophes() -> None: assert standard_english_quotation_mark_resolver.get_issues() == set() typewriter_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") ) assert typewriter_english_quote_convention is not None typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2170,7 +2170,7 @@ def test_resolution_with_apostrophes() -> None: def test_english_quote_continuers() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2209,7 +2209,7 @@ def 
test_english_quote_continuers() -> None: def test_spanish_quote_continuers() -> None: western_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("western_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") ) assert western_european_quote_convention is not None western_european_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2248,7 +2248,7 @@ def test_spanish_quote_continuers() -> None: def test_malformed_quotation_marks() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2283,7 +2283,7 @@ def test_malformed_quotation_marks() -> None: def test_unpaired_quotation_mark_issue() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2326,7 +2326,7 @@ def test_unpaired_quotation_mark_issue() -> None: def test_too_deep_nesting_issue() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2362,7 +2362,7 @@ def 
test_too_deep_nesting_issue() -> None: def test_incompatible_quotation_mark_issue() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2393,7 +2393,7 @@ def test_incompatible_quotation_mark_issue() -> None: def test_ambiguous_quotation_mark_issue() -> None: typewriter_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") ) assert typewriter_english_quote_convention is not None typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2433,7 +2433,7 @@ def test_ambiguous_quotation_mark_issue() -> None: def test_typewriter_english_quotation_mark_recognition() -> None: typewriter_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") ) assert typewriter_english_quote_convention is not None typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2464,7 +2464,7 @@ def test_typewriter_english_quotation_mark_recognition() -> None: def test_typewriter_french_mark_recognition() -> None: typewriter_french_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_french") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_french") ) assert typewriter_french_quote_convention is not None 
typewriter_french_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2493,7 +2493,7 @@ def test_typewriter_french_mark_recognition() -> None: def test_central_european_quotation_mark_recognition() -> None: central_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") ) assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2527,7 +2527,7 @@ def test_central_european_quotation_mark_recognition() -> None: def test_standard_swedish_quotation_mark_recognition() -> None: standard_swedish_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") ) assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -2561,17 +2561,17 @@ def test_standard_swedish_quotation_mark_recognition() -> None: def test_multiple_conventions_quotation_mark_recognition() -> None: typewriter_french_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_french") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_french") ) assert typewriter_french_quote_convention is not None central_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") ) assert central_european_quote_convention is not None standard_swedish_quote_convention = ( - 
standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_swedish") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") ) assert standard_swedish_quote_convention is not None multiple_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( diff --git a/tests/corpora/analysis/test_preliminary_quotation_analyzer.py b/tests/corpora/punctuation_analysis/test_preliminary_quotation_analyzer.py similarity index 99% rename from tests/corpora/analysis/test_preliminary_quotation_analyzer.py rename to tests/corpora/punctuation_analysis/test_preliminary_quotation_analyzer.py index de167f05..a52d2164 100644 --- a/tests/corpora/analysis/test_preliminary_quotation_analyzer.py +++ b/tests/corpora/punctuation_analysis/test_preliminary_quotation_analyzer.py @@ -1,4 +1,4 @@ -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( ApostropheProportionStatistics, Chapter, PreliminaryApostropheAnalyzer, diff --git a/tests/corpora/analysis/test_quotation_mark_finder.py b/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py similarity index 96% rename from tests/corpora/analysis/test_quotation_mark_finder.py rename to tests/corpora/punctuation_analysis/test_quotation_mark_finder.py index 10f6cb52..5fbfe5fc 100644 --- a/tests/corpora/analysis/test_quotation_mark_finder.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py @@ -1,4 +1,4 @@ -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( QuotationMarkFinder, QuotationMarkStringMatch, QuoteConventionSet, @@ -8,7 +8,7 @@ def test_that_all_possible_quotation_marks_are_identified() -> None: - quotation_mark_finder = QuotationMarkFinder(standard_quote_conventions.standard_quote_conventions) + quotation_mark_finder = QuotationMarkFinder(standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS) assert 
quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( TextSegment.Builder().set_text("\u201cSample Text\u201d").build() ) == [ @@ -178,7 +178,7 @@ def test_that_all_possible_quotation_marks_are_identified() -> None: def test_that_it_uses_the_quote_convention_set() -> None: standard_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("standard_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") ) assert standard_english_quote_convention is not None @@ -193,7 +193,7 @@ def test_that_it_uses_the_quote_convention_set() -> None: ) typewriter_english_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("typewriter_english") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") ) assert typewriter_english_quote_convention is not None @@ -215,7 +215,7 @@ def test_that_it_uses_the_quote_convention_set() -> None: ] western_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("western_european") + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") ) assert western_european_quote_convention is not None @@ -237,7 +237,7 @@ def test_that_it_uses_the_quote_convention_set() -> None: ] typewriter_western_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "typewriter_western_european" ) ) @@ -268,7 +268,7 @@ def test_that_it_uses_the_quote_convention_set() -> None: ] central_european_quote_convention = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name("central_european") + 
standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") ) assert central_european_quote_convention is not None diff --git a/tests/corpora/analysis/test_quotation_mark_metadata.py b/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py similarity index 94% rename from tests/corpora/analysis/test_quotation_mark_metadata.py rename to tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py index d7c2395e..912cf11a 100644 --- a/tests/corpora/analysis/test_quotation_mark_metadata.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py @@ -1,6 +1,6 @@ from typing import Union -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( QuotationMarkDirection, QuotationMarkMetadata, QuoteConvention, @@ -46,7 +46,7 @@ def test_update_quotation_mark() -> None: def get_quote_convention_by_name(name: str) -> QuoteConvention: quote_convention: Union[QuoteConvention, None] = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(name) + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) ) assert quote_convention is not None return quote_convention diff --git a/tests/corpora/analysis/test_quotation_mark_resolver.py b/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py similarity index 96% rename from tests/corpora/analysis/test_quotation_mark_resolver.py rename to tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py index 0b30172c..07988774 100644 --- a/tests/corpora/analysis/test_quotation_mark_resolver.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py @@ -1,6 +1,6 @@ from typing import List -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( DepthBasedQuotationMarkResolver, QuotationMarkResolver, QuotationMarkStringMatch, @@ -13,7 +13,7 @@ def test_reset() -> None: quotation_mark_resolver: 
QuotationMarkResolver = DepthBasedQuotationMarkResolver( - QuoteConventionDetectionResolutionSettings(standard_quote_conventions.standard_quote_conventions) + QuoteConventionDetectionResolutionSettings(standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS) ) assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] diff --git a/tests/corpora/analysis/test_quotation_mark_string_match.py b/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py similarity index 99% rename from tests/corpora/analysis/test_quotation_mark_string_match.py rename to tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py index e54caa31..c21568e9 100644 --- a/tests/corpora/analysis/test_quotation_mark_string_match.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py @@ -1,6 +1,6 @@ import regex -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( QuotationMarkDirection, QuotationMarkMetadata, QuotationMarkStringMatch, diff --git a/tests/corpora/analysis/test_quotation_mark_tabulator.py b/tests/corpora/punctuation_analysis/test_quotation_mark_tabulator.py similarity index 98% rename from tests/corpora/analysis/test_quotation_mark_tabulator.py rename to tests/corpora/punctuation_analysis/test_quotation_mark_tabulator.py index dde9ea3a..de9a787d 100644 --- a/tests/corpora/analysis/test_quotation_mark_tabulator.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_tabulator.py @@ -1,7 +1,7 @@ # QuotationMarkCounts tests from pytest import approx -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( QuotationMarkCounts, QuotationMarkDirection, QuotationMarkMetadata, diff --git a/tests/corpora/analysis/test_quote_convention.py b/tests/corpora/punctuation_analysis/test_quote_convention.py similarity index 94% rename from tests/corpora/analysis/test_quote_convention.py rename to 
tests/corpora/punctuation_analysis/test_quote_convention.py index 6a863f4f..d20e5337 100644 --- a/tests/corpora/analysis/test_quote_convention.py +++ b/tests/corpora/punctuation_analysis/test_quote_convention.py @@ -1,5 +1,5 @@ -from machine.corpora.analysis import QuotationMarkDirection -from machine.corpora.analysis.quote_convention import QuoteConvention, SingleLevelQuoteConvention +from machine.corpora.punctuation_analysis import QuotationMarkDirection +from machine.corpora.punctuation_analysis.quote_convention import QuoteConvention, SingleLevelQuoteConvention def test_single_level_quote_convention_normalize() -> None: @@ -86,13 +86,13 @@ def test_single_level_quote_convention_normalize() -> None: def test_get_num_levels() -> None: empty_quote_convention = QuoteConvention("empty-quote-convention", []) - assert empty_quote_convention.get_num_levels() == 0 + assert empty_quote_convention.num_levels == 0 one_level_quote_convention = QuoteConvention( "one-level-quote-convention", [SingleLevelQuoteConvention("\u201c", "\u201d")], ) - assert one_level_quote_convention.get_num_levels() == 1 + assert one_level_quote_convention.num_levels == 1 two_level_quote_convention = QuoteConvention( "two-level-quote-convention", @@ -101,7 +101,7 @@ def test_get_num_levels() -> None: SingleLevelQuoteConvention("\u2018", "\u2019"), ], ) - assert two_level_quote_convention.get_num_levels() == 2 + assert two_level_quote_convention.num_levels == 2 three_level_quote_convention = QuoteConvention( "three-level-quote-convention", @@ -111,7 +111,7 @@ def test_get_num_levels() -> None: SingleLevelQuoteConvention("\u201D", "\u201D"), ], ) - assert three_level_quote_convention.get_num_levels() == 3 + assert three_level_quote_convention.num_levels == 3 def test_get_opening_quote_at_level() -> None: @@ -298,8 +298,8 @@ def test_is_compatible_with_observed_quotation_marks() -> None: def test_normalize() -> None: empty_quote_convention = QuoteConvention("empty-quote-convention", []) 
normalized_empty_quote_convention = empty_quote_convention.normalize() - assert normalized_empty_quote_convention.get_name() == "empty-quote-convention_normalized" - assert normalized_empty_quote_convention.get_num_levels() == 0 + assert normalized_empty_quote_convention.name == "empty-quote-convention_normalized" + assert normalized_empty_quote_convention.num_levels == 0 standard_english_quote_convention = QuoteConvention( "standard-english-quote-convention", @@ -311,8 +311,8 @@ def test_normalize() -> None: ], ) normalized_standard_english_quote_convention = standard_english_quote_convention.normalize() - assert normalized_standard_english_quote_convention.get_name() == "standard-english-quote-convention_normalized" - assert normalized_standard_english_quote_convention.get_num_levels() == 4 + assert normalized_standard_english_quote_convention.name == "standard-english-quote-convention_normalized" + assert normalized_standard_english_quote_convention.num_levels == 4 assert normalized_standard_english_quote_convention.get_opening_quote_at_level(1) == '"' assert normalized_standard_english_quote_convention.get_closing_quote_at_level(1) == '"' assert normalized_standard_english_quote_convention.get_opening_quote_at_level(2) == "'" @@ -331,8 +331,8 @@ def test_normalize() -> None: ], ) normalized_western_european_quote_convention = western_european_quote_convention.normalize() - assert normalized_western_european_quote_convention.get_name() == "test-quote-convention_normalized" - assert normalized_western_european_quote_convention.get_num_levels() == 3 + assert normalized_western_european_quote_convention.name == "test-quote-convention_normalized" + assert normalized_western_european_quote_convention.num_levels == 3 assert normalized_western_european_quote_convention.get_opening_quote_at_level(1) == '"' assert normalized_western_european_quote_convention.get_closing_quote_at_level(1) == '"' assert 
normalized_western_european_quote_convention.get_opening_quote_at_level(2) == '"' @@ -353,10 +353,10 @@ def test_normalize() -> None: hybrid_british_typewriter_english_quote_convention.normalize() ) assert ( - normalized_hybrid_british_typewriter_english_quote_convention.get_name() + normalized_hybrid_british_typewriter_english_quote_convention.name == "hybrid-british-typewriter-english-quote-convention_normalized" ) - assert normalized_hybrid_british_typewriter_english_quote_convention.get_num_levels() == 3 + assert normalized_hybrid_british_typewriter_english_quote_convention.num_levels == 3 assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quote_at_level(1) == '"' assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quote_at_level(1) == '"' assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quote_at_level(2) == "'" @@ -380,4 +380,4 @@ def test_print_summary() -> None: + "\u2018Second-level quote\u2019\n" + "\u201DThird-level quote\u201D\n" ) - assert quote_convention._get_summary_message() == expected_summary_message + assert str(quote_convention) == expected_summary_message diff --git a/tests/corpora/analysis/test_quote_convention_detector.py b/tests/corpora/punctuation_analysis/test_quote_convention_detector.py similarity index 82% rename from tests/corpora/analysis/test_quote_convention_detector.py rename to tests/corpora/punctuation_analysis/test_quote_convention_detector.py index 442643b6..31df6034 100644 --- a/tests/corpora/analysis/test_quote_convention_detector.py +++ b/tests/corpora/punctuation_analysis/test_quote_convention_detector.py @@ -1,7 +1,7 @@ from typing import Union from machine.corpora import parse_usfm -from machine.corpora.analysis import QuoteConventionAnalysis, QuoteConventionDetector +from machine.corpora.punctuation_analysis import QuoteConventionAnalysis, QuoteConventionDetector # Text comes from the World English Bible, which is in the public 
domain. @@ -16,7 +16,7 @@ def test_standard_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "standard_english" + assert analysis.best_quote_convention.name == "standard_english" def test_typewriter_english() -> None: @@ -29,7 +29,7 @@ def test_typewriter_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "typewriter_english" + assert analysis.best_quote_convention.name == "typewriter_english" def test_british_english() -> None: @@ -42,7 +42,7 @@ def test_british_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "british_english" + assert analysis.best_quote_convention.name == "british_english" def test_british_typewriter_english() -> None: @@ -55,7 +55,7 @@ def test_british_typewriter_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "british_typewriter_english" + assert analysis.best_quote_convention.name == "british_typewriter_english" def test_hybrid_typewriter_english() -> None: @@ -68,7 +68,7 @@ def test_hybrid_typewriter_english() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "hybrid_typewriter_english" + assert analysis.best_quote_convention.name == "hybrid_typewriter_english" def test_standard_french() -> None: @@ -81,7 +81,7 @@ def test_standard_french() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "standard_french" + assert analysis.best_quote_convention.name == "standard_french" def test_typewriter_french() -> None: @@ -94,7 +94,7 @@ def test_typewriter_french() -> None: """ analysis = 
detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "typewriter_french" + assert analysis.best_quote_convention.name == "typewriter_french" # french_variant requires a 3rd-level of quotes to differentiate from standard_french @@ -110,7 +110,7 @@ def test_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "western_european" + assert analysis.best_quote_convention.name == "western_european" def test_british_inspired_western_european() -> None: @@ -123,7 +123,7 @@ def test_british_inspired_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "british_inspired_western_european" + assert analysis.best_quote_convention.name == "british_inspired_western_european" def test_typewriter_western_european() -> None: @@ -136,7 +136,7 @@ def test_typewriter_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "typewriter_western_european" + assert analysis.best_quote_convention.name == "typewriter_western_european" def test_typewriter_western_european_variant() -> None: @@ -149,7 +149,7 @@ def test_typewriter_western_european_variant() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "typewriter_western_european_variant" + assert analysis.best_quote_convention.name == "typewriter_western_european_variant" def test_hybrid_typewriter_western_european() -> None: @@ -162,7 +162,7 @@ def test_hybrid_typewriter_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "hybrid_typewriter_western_european" + assert 
analysis.best_quote_convention.name == "hybrid_typewriter_western_european" def test_hybrid_british_typewriter_western_european() -> None: @@ -175,7 +175,7 @@ def test_hybrid_british_typewriter_western_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "hybrid_british_typewriter_western_european" + assert analysis.best_quote_convention.name == "hybrid_british_typewriter_western_european" def test_central_european() -> None: @@ -188,7 +188,7 @@ def test_central_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "central_european" + assert analysis.best_quote_convention.name == "central_european" def test_central_european_guillemets() -> None: @@ -201,7 +201,7 @@ def test_central_european_guillemets() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "central_european_guillemets" + assert analysis.best_quote_convention.name == "central_european_guillemets" def test_standard_swedish() -> None: @@ -214,7 +214,7 @@ def test_standard_swedish() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "standard_swedish" + assert analysis.best_quote_convention.name == "standard_swedish" def test_standard_finnish() -> None: @@ -227,7 +227,7 @@ def test_standard_finnish() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "standard_finnish" + assert analysis.best_quote_convention.name == "standard_finnish" def test_eastern_european() -> None: @@ -240,7 +240,7 @@ def test_eastern_european() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "eastern_european" 
+ assert analysis.best_quote_convention.name == "eastern_european" def test_standard_russian() -> None: @@ -253,7 +253,7 @@ def test_standard_russian() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "standard_russian" + assert analysis.best_quote_convention.name == "standard_russian" def test_standard_arabic() -> None: @@ -266,7 +266,7 @@ def test_standard_arabic() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "standard_arabic" + assert analysis.best_quote_convention.name == "standard_arabic" def test_non_standard_arabic() -> None: @@ -279,7 +279,7 @@ def test_non_standard_arabic() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "non-standard_arabic" + assert analysis.best_quote_convention.name == "non-standard_arabic" def test_mismatched_quotation_marks() -> None: @@ -296,7 +296,7 @@ def test_mismatched_quotation_marks() -> None: """ analysis = detect_quote_convention(usfm) assert analysis is not None - assert analysis.best_quote_convention.get_name() == "standard_english" + assert analysis.best_quote_convention.name == "standard_english" def detect_quote_convention(usfm: str) -> Union[QuoteConventionAnalysis, None]: diff --git a/tests/corpora/analysis/test_quote_convention_set.py b/tests/corpora/punctuation_analysis/test_quote_convention_set.py similarity index 99% rename from tests/corpora/analysis/test_quote_convention_set.py rename to tests/corpora/punctuation_analysis/test_quote_convention_set.py index a2b4fb09..58ee0269 100644 --- a/tests/corpora/analysis/test_quote_convention_set.py +++ b/tests/corpora/punctuation_analysis/test_quote_convention_set.py @@ -1,6 +1,6 @@ from pytest import approx -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( 
QuotationMarkDirection, QuotationMarkMetadata, QuotationMarkTabulator, diff --git a/tests/corpora/analysis/test_text_segment.py b/tests/corpora/punctuation_analysis/test_text_segment.py similarity index 99% rename from tests/corpora/analysis/test_text_segment.py rename to tests/corpora/punctuation_analysis/test_text_segment.py index 08444faf..25d64fef 100644 --- a/tests/corpora/analysis/test_text_segment.py +++ b/tests/corpora/punctuation_analysis/test_text_segment.py @@ -1,5 +1,5 @@ from machine.corpora import UsfmToken, UsfmTokenType -from machine.corpora.analysis import TextSegment, UsfmMarkerType +from machine.corpora.punctuation_analysis import TextSegment, UsfmMarkerType def test_builder_initialization() -> None: diff --git a/tests/corpora/analysis/test_usfm_structure_extractor.py b/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py similarity index 99% rename from tests/corpora/analysis/test_usfm_structure_extractor.py rename to tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py index 31d26e27..e489a620 100644 --- a/tests/corpora/analysis/test_usfm_structure_extractor.py +++ b/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py @@ -1,7 +1,7 @@ from typing import List from machine.corpora import UsfmParser -from machine.corpora.analysis import Chapter, TextSegment, UsfmMarkerType, UsfmStructureExtractor, Verse +from machine.corpora.punctuation_analysis import Chapter, TextSegment, UsfmMarkerType, UsfmStructureExtractor, Verse verse_text_parser_state = usfm_parser = UsfmParser("").state verse_text_parser_state.verse_ref.verse_num = 1 diff --git a/tests/corpora/analysis/test_verse.py b/tests/corpora/punctuation_analysis/test_verse.py similarity index 95% rename from tests/corpora/analysis/test_verse.py rename to tests/corpora/punctuation_analysis/test_verse.py index 3d6de831..ddfa58d4 100644 --- a/tests/corpora/analysis/test_verse.py +++ b/tests/corpora/punctuation_analysis/test_verse.py @@ -1,4 +1,4 @@ -from 
machine.corpora.analysis import TextSegment, Verse +from machine.corpora.punctuation_analysis import TextSegment, Verse def test_initialize_verse() -> None: diff --git a/tests/corpora/test_fallback_quotation_mark_resolver.py b/tests/corpora/test_fallback_quotation_mark_resolver.py index 241b2222..2ea25708 100644 --- a/tests/corpora/test_fallback_quotation_mark_resolver.py +++ b/tests/corpora/test_fallback_quotation_mark_resolver.py @@ -1,5 +1,5 @@ from machine.corpora import FallbackQuotationMarkResolver, QuotationMarkUpdateResolutionSettings -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( QuotationMarkDirection, QuotationMarkMetadata, QuotationMarkResolutionIssue, @@ -12,7 +12,7 @@ def test_reset(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -32,7 +32,7 @@ def test_reset(): def test_simple_quotation_mark_resolution(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -65,7 +65,7 @@ def test_simple_quotation_mark_resolution(): def test_is_opening_quote(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -108,7 +108,7 @@ def test_is_opening_quote(): def test_is_opening_quote_with_unambiguous_quote_convention(): - english_quote_convention = 
standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -135,7 +135,7 @@ def test_is_opening_quote_with_unambiguous_quote_convention(): def test_is_opening_quote_stateful(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -156,7 +156,7 @@ def test_is_opening_quote_stateful(): def test_does_most_recent_opening_mark_immediately_precede(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -196,7 +196,7 @@ def test_does_most_recent_opening_mark_immediately_precede(): def test_is_closing_quote(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -239,7 +239,7 @@ def test_is_closing_quote(): def test_is_closing_quote_with_unambiguous_quote_convention(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -266,7 +266,7 @@ def test_is_closing_quote_with_unambiguous_quote_convention(): def 
test_resolve_opening_quote(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None @@ -286,7 +286,7 @@ def test_resolve_opening_quote(): def test_resolve_closing_quote(): - english_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( "standard_english" ) assert english_quote_convention is not None diff --git a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py index 3bd706df..4e18b2a8 100644 --- a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py +++ b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py @@ -7,7 +7,7 @@ UpdateUsfmParserHandler, parse_usfm, ) -from machine.corpora.analysis import QuoteConvention, standard_quote_conventions +from machine.corpora.punctuation_analysis import QuoteConvention, standard_quote_conventions simple_normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal @@ -413,7 +413,7 @@ def assert_usfm_equal(observed_usfm: str, expected_usfm: str) -> None: def get_quote_convention_by_name(name: str) -> QuoteConvention: quote_convention: Union[QuoteConvention, None] = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(name) + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) ) assert quote_convention is not None return quote_convention diff --git a/tests/corpora/test_quotation_mark_update_first_pass.py b/tests/corpora/test_quotation_mark_update_first_pass.py index af38368a..2e80c2bd 100644 --- 
a/tests/corpora/test_quotation_mark_update_first_pass.py +++ b/tests/corpora/test_quotation_mark_update_first_pass.py @@ -1,7 +1,7 @@ from typing import List, Union from machine.corpora import QuotationMarkUpdateFirstPass, QuotationMarkUpdateStrategy, parse_usfm -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( Chapter, QuotationMarkResolutionIssue, QuoteConvention, @@ -631,12 +631,12 @@ def test_ambiguous_in_first_unpaired_in_second() -> None: def run_first_pass( normalized_usfm: str, source_quote_convention_name: str, target_quote_convention_name: str ) -> List[QuotationMarkUpdateStrategy]: - source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + source_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( source_quote_convention_name ) assert source_quote_convention is not None - target_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + target_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( target_quote_convention_name ) assert target_quote_convention is not None @@ -650,12 +650,12 @@ def run_first_pass( def run_first_pass_on_chapter( verse_texts: List[str], source_quote_convention_name: str, target_quote_convention_name: str ) -> QuotationMarkUpdateStrategy: - source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + source_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( source_quote_convention_name ) assert source_quote_convention is not None - target_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + target_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( target_quote_convention_name ) assert 
target_quote_convention is not None @@ -669,7 +669,7 @@ def run_first_pass_on_chapter( def get_quote_convention_by_name(name: str) -> QuoteConvention: quote_convention: Union[QuoteConvention, None] = ( - standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name(name) + standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) ) assert quote_convention is not None return quote_convention diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index ff67f957..68e0923c 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -14,7 +14,7 @@ UsfmUpdateBlockElementType, parse_usfm, ) -from machine.corpora.analysis import ( +from machine.corpora.punctuation_analysis import ( QuotationMarkDirection, QuotationMarkFinder, QuotationMarkMetadata, @@ -641,12 +641,12 @@ def create_quote_convention_changing_usfm_update_block_handler( target_quote_convention_name: str, quotation_mark_update_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), ) -> QuoteConventionChangingUsfmUpdateBlockHandler: - source_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + source_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( source_quote_convention_name ) assert source_quote_convention is not None - target_quote_convention = standard_quote_conventions.standard_quote_conventions.get_quote_convention_by_name( + target_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( target_quote_convention_name ) assert target_quote_convention is not None From 59946ce44154e03cf9223cc5c41a9ccc183b4c4c Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 10 Jul 2025 
10:40:51 -0400 Subject: [PATCH 22/31] Eli's requested code-review changes --- machine/corpora/__init__.py | 8 +- .../fallback_quotation_mark_resolver.py | 52 +- .../corpora/punctuation_analysis/__init__.py | 14 +- .../depth_based_quotation_mark_resolver.py | 324 ++++----- ...=> preliminary_quotation_mark_analyzer.py} | 165 +++-- .../quotation_mark_finder.py | 18 +- .../quotation_mark_resolver.py | 7 +- .../quotation_mark_string_match.py | 8 +- .../quotation_mark_tabulator.py | 52 +- .../punctuation_analysis/quote_convention.py | 59 +- ...onvention_detection_resolution_settings.py | 18 +- .../quote_convention_detector.py | 6 +- .../quote_convention_set.py | 72 +- .../punctuation_analysis/text_segment.py | 6 +- .../usfm_structure_extractor.py | 9 +- ...tation_mark_denormalization_first_pass.py} | 2 +- ...ormalization_usfm_update_block_handler.py} | 2 +- .../quotation_mark_update_first_pass.py | 4 +- ...tion_changing_usfm_update_block_handler.py | 4 +- ...est_depth_based_quotation_mark_resolver.py | 624 +++++++++--------- ...st_preliminary_quotation_mark_analyzer.py} | 99 +-- .../test_quotation_mark_resolver.py | 20 +- .../test_quote_convention.py | 116 ++-- .../test_quote_convention_detector.py | 2 +- .../punctuation_analysis/test_text_segment.py | 8 +- .../test_usfm_structure_extractor.py | 13 - .../test_fallback_quotation_mark_resolver.py | 52 +- ...normalization_usfm_block_update_handler.py | 8 +- ...tion_changing_usfm_block_update_handler.py | 6 +- 29 files changed, 873 insertions(+), 905 deletions(-) rename machine/corpora/punctuation_analysis/{preliminary_quotation_analyzer.py => preliminary_quotation_mark_analyzer.py} (65%) rename machine/corpora/{quotation_denormalization_first_pass.py => quotation_mark_denormalization_first_pass.py} (85%) rename machine/corpora/{quotation_denormalization_usfm_update_block_handler.py => quotation_mark_denormalization_usfm_update_block_handler.py} (86%) rename 
tests/corpora/punctuation_analysis/{test_preliminary_quotation_analyzer.py => test_preliminary_quotation_mark_analyzer.py} (93%) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 25c5b4d3..f48b6990 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -25,8 +25,8 @@ from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler -from .quotation_denormalization_first_pass import QuotationDenormalizationFirstPass -from .quotation_denormalization_usfm_update_block_handler import QuotationDenormalizationUsfmUpdateBlockHandler +from .quotation_mark_denormalization_first_pass import QuotationMarkDenormalizationFirstPass +from .quotation_mark_denormalization_usfm_update_block_handler import QuotationMarkDenormalizationUsfmUpdateBlockHandler from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings from .quotation_mark_update_settings import QuotationMarkUpdateSettings @@ -134,8 +134,8 @@ "QuotationMarkUpdateResolutionSettings", "QuotationMarkUpdateStrategy", "QuotationMarkUpdateFirstPass", - "QuotationDenormalizationFirstPass", - "QuotationDenormalizationUsfmUpdateBlockHandler", + "QuotationMarkDenormalizationFirstPass", + "QuotationMarkDenormalizationUsfmUpdateBlockHandler", "QuotationMarkUpdateSettings", "RtlReferenceOrder", "ScriptureElement", diff --git a/machine/corpora/fallback_quotation_mark_resolver.py b/machine/corpora/fallback_quotation_mark_resolver.py index 9f4694c0..fa337d11 100644 --- a/machine/corpora/fallback_quotation_mark_resolver.py +++ b/machine/corpora/fallback_quotation_mark_resolver.py @@ -20,31 +20,31 @@ def reset(self) -> None: self._issues = set() def resolve_quotation_marks( - self, quote_matches: 
list[QuotationMarkStringMatch] + self, quotation_mark_matches: list[QuotationMarkStringMatch] ) -> Generator[QuotationMarkMetadata, None, None]: - for quote_match in quote_matches: - yield from self._resolve_quotation_mark(quote_match) + for quotation_mark_match in quotation_mark_matches: + yield from self._resolve_quotation_mark(quotation_mark_match) def _resolve_quotation_mark( self, - quote_match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, ) -> Generator[QuotationMarkMetadata, None, None]: - if self._is_opening_quote(quote_match): - quote: Union[QuotationMarkMetadata, None] = self._resolve_opening_mark(quote_match) - if quote is not None: - yield quote + if self._is_opening_quotation_mark(quotation_mark_match): + quotation_mark: Union[QuotationMarkMetadata, None] = self._resolve_opening_mark(quotation_mark_match) + if quotation_mark is not None: + yield quotation_mark else: self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) - elif self._is_closing_quote(quote_match): - quote: Union[QuotationMarkMetadata, None] = self._resolve_closing_mark(quote_match) - if quote is not None: - yield quote + elif self._is_closing_quotation_mark(quotation_mark_match): + quotation_mark: Union[QuotationMarkMetadata, None] = self._resolve_closing_mark(quotation_mark_match) + if quotation_mark is not None: + yield quotation_mark else: self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) else: self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) - def _is_opening_quote( + def _is_opening_quotation_mark( self, match: QuotationMarkStringMatch, ) -> bool: @@ -78,7 +78,7 @@ def _does_most_recent_opening_mark_immediately_precede( and self._last_quotation_mark.end_index == match.start_index ) - def _is_closing_quote( + def _is_closing_quotation_mark( self, match: QuotationMarkStringMatch, ) -> bool: @@ -94,27 +94,31 @@ def _is_closing_quote( return False - def _resolve_opening_mark(self, 
quote_match: QuotationMarkStringMatch) -> Union[QuotationMarkMetadata, None]: + def _resolve_opening_mark( + self, quotation_mark_match: QuotationMarkStringMatch + ) -> Union[QuotationMarkMetadata, None]: possible_depths: Set[int] = self._settings.get_possible_depths( - quote_match.quotation_mark, QuotationMarkDirection.OPENING + quotation_mark_match.quotation_mark, QuotationMarkDirection.OPENING ) if len(possible_depths) == 0: return None - quote = quote_match.resolve(min(possible_depths), QuotationMarkDirection.OPENING) - self._last_quotation_mark = quote - return quote + quotation_mark = quotation_mark_match.resolve(min(possible_depths), QuotationMarkDirection.OPENING) + self._last_quotation_mark = quotation_mark + return quotation_mark - def _resolve_closing_mark(self, quote_match: QuotationMarkStringMatch) -> Union[QuotationMarkMetadata, None]: + def _resolve_closing_mark( + self, quotation_mark_match: QuotationMarkStringMatch + ) -> Union[QuotationMarkMetadata, None]: possible_depths: Set[int] = self._settings.get_possible_depths( - quote_match.quotation_mark, QuotationMarkDirection.CLOSING + quotation_mark_match.quotation_mark, QuotationMarkDirection.CLOSING ) if len(possible_depths) == 0: return None - quote = quote_match.resolve(min(possible_depths), QuotationMarkDirection.CLOSING) - self._last_quotation_mark = quote - return quote + quotation_mark = quotation_mark_match.resolve(min(possible_depths), QuotationMarkDirection.CLOSING) + self._last_quotation_mark = quotation_mark + return quotation_mark def get_issues(self) -> Set[QuotationMarkResolutionIssue]: return self._issues diff --git a/machine/corpora/punctuation_analysis/__init__.py b/machine/corpora/punctuation_analysis/__init__.py index 081254f6..4ac5d9df 100644 --- a/machine/corpora/punctuation_analysis/__init__.py +++ b/machine/corpora/punctuation_analysis/__init__.py @@ -1,15 +1,15 @@ from .chapter import Chapter from .depth_based_quotation_mark_resolver import ( DepthBasedQuotationMarkResolver, 
- QuotationContinuerState, - QuotationContinuerStyle, QuotationMarkCategorizer, QuotationMarkResolverState, + QuoteContinuerState, + QuoteContinuerStyle, ) -from .preliminary_quotation_analyzer import ( +from .preliminary_quotation_mark_analyzer import ( ApostropheProportionStatistics, PreliminaryApostropheAnalyzer, - PreliminaryQuotationAnalyzer, + PreliminaryQuotationMarkAnalyzer, QuotationMarkGrouper, QuotationMarkSequences, QuotationMarkWordPositions, @@ -36,10 +36,10 @@ "Chapter", "DepthBasedQuotationMarkResolver", "PreliminaryApostropheAnalyzer", - "PreliminaryQuotationAnalyzer", + "PreliminaryQuotationMarkAnalyzer", "SingleLevelQuoteConvention", - "QuotationContinuerState", - "QuotationContinuerStyle", + "QuoteContinuerState", + "QuoteContinuerStyle", "QuotationMarkCategorizer", "QuotationMarkCounts", "QuotationMarkDirection", diff --git a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py index bff404b3..a9b158ce 100644 --- a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py +++ b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py @@ -19,85 +19,80 @@ def __init__(self): def reset(self) -> None: self._quotation_stack: list[QuotationMarkMetadata] = [] - self._current_depth: int = 0 @property def current_depth(self) -> int: - return self._current_depth + 1 + return len(self._quotation_stack) def has_open_quotation_mark(self) -> bool: - return self._current_depth > 0 + return self.current_depth > 0 def are_more_than_n_quotes_open(self, n: int) -> bool: - return self._current_depth > n + return self.current_depth > n - def add_opening_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - quote = quote_match.resolve(self._current_depth + 1, QuotationMarkDirection.OPENING) - self._quotation_stack.append(quote) - self._current_depth += 1 - return quote + def 
add_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + quotation_mark = quotation_mark_match.resolve(self.current_depth + 1, QuotationMarkDirection.OPENING) + self._quotation_stack.append(quotation_mark) + return quotation_mark - def add_closing_quotation_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: - quote = quote_match.resolve(self._current_depth, QuotationMarkDirection.CLOSING) + def add_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + quotation_mark = quotation_mark_match.resolve(self.current_depth, QuotationMarkDirection.CLOSING) self._quotation_stack.pop() - self._current_depth -= 1 - return quote + return quotation_mark def get_opening_quotation_mark_at_depth(self, depth: int) -> str: - if depth > len(self._quotation_stack): + if depth > self.current_depth: raise RuntimeError( - "get_opening_quotation_mark_at_depth() was called with a depth greater than the quotation stack size." + f"Opening quotation mark at depth ${depth} was requested from a quotation stack " + + f"with depth ${self.current_depth}." ) return self._quotation_stack[depth - 1].quotation_mark def get_deepest_opening_quotation_mark(self) -> str: if not self.has_open_quotation_mark(): - raise RuntimeError( - "get_deepest_opening_quotation_mark() was called when the stack of quotation marks was empty." 
- ) + raise RuntimeError("The deepest opening quotation mark was requested from an empty quotation stack.") return self._quotation_stack[-1].quotation_mark -class QuotationContinuerStyle(Enum): +class QuoteContinuerStyle(Enum): UNDETERMINED = auto() ENGLISH = auto() SPANISH = auto() -class QuotationContinuerState: +class QuoteContinuerState: def __init__(self): self.reset() def reset(self) -> None: - self._quotation_continuer_stack: list[QuotationMarkMetadata] = [] - self._current_depth = 0 - self._continuer_style = QuotationContinuerStyle.UNDETERMINED + self._quote_continuer_mark_stack: list[QuotationMarkMetadata] = [] + self._continuer_style = QuoteContinuerStyle.UNDETERMINED @property def current_depth(self) -> int: - return self._current_depth + return len(self._quote_continuer_mark_stack) def continuer_has_been_observed(self) -> bool: - return len(self._quotation_continuer_stack) > 0 + return len(self._quote_continuer_mark_stack) > 0 @property - def continuer_style(self) -> QuotationContinuerStyle: + def continuer_style(self) -> QuoteContinuerStyle: return self._continuer_style - def add_quotation_continuer( + def add_quote_continuer( self, - quote_match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, quotation_mark_resolver_state: QuotationMarkResolverState, - quotation_continuer_style: QuotationContinuerStyle, + quote_continuer_style: QuoteContinuerStyle, ) -> QuotationMarkMetadata: - quote = quote_match.resolve(len(self._quotation_continuer_stack) + 1, QuotationMarkDirection.OPENING) - self._quotation_continuer_stack.append(quote) - self._current_depth += 1 - self._continuer_style = quotation_continuer_style - if len(self._quotation_continuer_stack) == len(quotation_mark_resolver_state._quotation_stack): - self._quotation_continuer_stack.clear() - self._current_depth = 0 - return quote + quotation_mark = quotation_mark_match.resolve( + len(self._quote_continuer_mark_stack) + 1, QuotationMarkDirection.OPENING + ) + 
self._quote_continuer_mark_stack.append(quotation_mark) + self._continuer_style = quote_continuer_style + if len(self._quote_continuer_mark_stack) == len(quotation_mark_resolver_state._quotation_stack): + self._quote_continuer_mark_stack.clear() + return quotation_mark class QuotationMarkCategorizer: @@ -107,89 +102,85 @@ def __init__( self, quotation_mark_resolution_settings: QuotationMarkResolutionSettings, quotation_mark_resolver_state: QuotationMarkResolverState, - quotation_continuer_state: QuotationContinuerState, + quote_continuer_state: QuoteContinuerState, ): self._settings = quotation_mark_resolution_settings self._quotation_mark_resolver_state = quotation_mark_resolver_state - self._quotation_continuer_state = quotation_continuer_state + self._quote_continuer_state = quote_continuer_state - def is_english_quotation_continuer( + def is_english_quote_continuer( self, - quote_match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, previous_match: Union[QuotationMarkStringMatch, None], next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if self._quotation_continuer_state.continuer_style == QuotationContinuerStyle.SPANISH: + if self._quote_continuer_state.continuer_style == QuoteContinuerStyle.SPANISH: return False - if not self._meets_quote_continuer_prerequisites(quote_match, previous_match, next_match): + if not self._meets_quote_continuer_prerequisites(quotation_mark_match, previous_match, next_match): return False - if not self._quotation_continuer_state.continuer_has_been_observed(): - if quote_match._start_index > 0: - return False - if quote_match.quotation_mark != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( - self._quotation_continuer_state.current_depth + 1 - ): + if ( + quotation_mark_match.quotation_mark + != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( + self._quote_continuer_state.current_depth + 1 + ) + ): + return False + + if not 
self._quote_continuer_state.continuer_has_been_observed(): + if quotation_mark_match._start_index > 0: return False + + # Check the next quotation mark match, since quote continuers must appear consecutively if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): - if next_match is None or next_match.start_index != quote_match.end_index: + if next_match is None or next_match.start_index != quotation_mark_match.end_index: return False - else: - if quote_match.quotation_mark != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( - self._quotation_continuer_state.current_depth + 1 - ): - return False return True - def is_spanish_quotation_continuer( + def is_spanish_quote_continuer( self, - quote_match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, previous_match: Union[QuotationMarkStringMatch, None], next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if self._quotation_continuer_state.continuer_style == QuotationContinuerStyle.ENGLISH: + if self._quote_continuer_state.continuer_style == QuoteContinuerStyle.ENGLISH: + return False + if not self._meets_quote_continuer_prerequisites(quotation_mark_match, previous_match, next_match): return False - if not self._meets_quote_continuer_prerequisites(quote_match, previous_match, next_match): + + if not self._settings.are_marks_a_valid_pair( + self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( + self._quote_continuer_state.current_depth + 1 + ), + quotation_mark_match.quotation_mark, + ): return False - if not self._quotation_continuer_state.continuer_has_been_observed(): - if quote_match._start_index > 0: + if not self._quote_continuer_state.continuer_has_been_observed(): + if quotation_mark_match._start_index > 0: return False # this has only been observed with guillemets so far - if quote_match.quotation_mark != "»": - return False - if not self._settings.are_marks_a_valid_pair( - 
self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( - self._quotation_continuer_state.current_depth + 1 - ), - quote_match.quotation_mark, - ): + if quotation_mark_match.quotation_mark != "»": return False + + # Check the next quotation mark match, since quote continuers must appear consecutively if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1): - if next_match is None or next_match.start_index != quote_match.end_index: + if next_match is None or next_match.start_index != quotation_mark_match.end_index: return False - else: - if not self._settings.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth( - self._quotation_continuer_state.current_depth + 1 - ), - quote_match.quotation_mark, - ): - return False return True def _meets_quote_continuer_prerequisites( self, - quote_match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, previous_match: Union[QuotationMarkStringMatch, None], next_match: Union[QuotationMarkStringMatch, None], ) -> bool: if ( self._settings.should_rely_on_paragraph_markers - and not quote_match._text_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH) + and not quotation_mark_match._text_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH) ): return False if not self._quotation_mark_resolver_state.has_open_quotation_mark(): @@ -197,90 +188,95 @@ def _meets_quote_continuer_prerequisites( return True - def is_opening_quote( + def is_opening_quotation_mark( self, - match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, ) -> bool: - if not self._settings.is_valid_opening_quotation_mark(match): + if not self._settings.is_valid_opening_quotation_mark(quotation_mark_match): return False # if the quote convention is ambiguous, use whitespace as a clue - if self._settings.is_valid_closing_quotation_mark(match): + if self._settings.is_valid_closing_quotation_mark(quotation_mark_match): 
return ( - match.has_leading_whitespace() - or self._most_recent_opening_mark_immediately_precedes(match) - or match.has_quote_introducer_in_leading_substring() - ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation()) + quotation_mark_match.has_leading_whitespace() + or self._most_recent_opening_mark_immediately_precedes(quotation_mark_match) + or quotation_mark_match.has_quote_introducer_in_leading_substring() + ) and not ( + quotation_mark_match.has_trailing_whitespace() or quotation_mark_match.has_trailing_punctuation() + ) return True - def is_closing_quote( + def is_closing_quotation_mark( self, - match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, ) -> bool: - if not self._settings.is_valid_closing_quotation_mark(match): + if not self._settings.is_valid_closing_quotation_mark(quotation_mark_match): return False # if the quote convention is ambiguous, use whitespace as a clue - if self._settings.is_valid_opening_quotation_mark(match): + if self._settings.is_valid_opening_quotation_mark(quotation_mark_match): return ( - match.has_trailing_whitespace() - or match.has_trailing_punctuation() - or match.is_at_end_of_segment() - or match.next_character_matches(self._settings.closing_quotation_mark_regex) - ) and not match.has_leading_whitespace() + quotation_mark_match.has_trailing_whitespace() + or quotation_mark_match.has_trailing_punctuation() + or quotation_mark_match.is_at_end_of_segment() + or quotation_mark_match.next_character_matches(self._settings.closing_quotation_mark_regex) + ) and not quotation_mark_match.has_leading_whitespace() return True - def is_malformed_opening_quote( + def is_malformed_opening_quotation_mark( self, - match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, ) -> bool: - if not self._settings.is_valid_opening_quotation_mark(match): + if not self._settings.is_valid_opening_quotation_mark(quotation_mark_match): return False - if 
match.has_quote_introducer_in_leading_substring(): + if quotation_mark_match.has_quote_introducer_in_leading_substring(): return True if ( - match.has_leading_whitespace() - and match.has_trailing_whitespace() + quotation_mark_match.has_leading_whitespace() + and quotation_mark_match.has_trailing_whitespace() and not self._quotation_mark_resolver_state.has_open_quotation_mark() ): return True return False - def is_malformed_closing_quote( + def is_malformed_closing_quotation_mark( self, - match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, ) -> bool: - if not self._settings.is_valid_closing_quotation_mark(match): + if not self._settings.is_valid_closing_quotation_mark(quotation_mark_match): return False return ( ( - match.is_at_end_of_segment() - or not match.has_trailing_whitespace() - or (match.has_leading_whitespace() and match.has_trailing_whitespace()) + quotation_mark_match.is_at_end_of_segment() + or not quotation_mark_match.has_trailing_whitespace() + or (quotation_mark_match.has_leading_whitespace() and quotation_mark_match.has_trailing_whitespace()) ) and self._quotation_mark_resolver_state.has_open_quotation_mark() and self._settings.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.quotation_mark + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), + quotation_mark_match.quotation_mark, ) ) - def is_unpaired_closing_quote( + def is_unpaired_closing_quotation_mark( self, - match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, ) -> bool: - if not self._settings.is_valid_closing_quotation_mark(match): + if not self._settings.is_valid_closing_quotation_mark(quotation_mark_match): return False if self._quotation_mark_resolver_state.has_open_quotation_mark(): return False - return not match.has_leading_whitespace() and (match.is_at_end_of_segment() or match.has_trailing_whitespace()) + return not 
quotation_mark_match.has_leading_whitespace() and ( + quotation_mark_match.is_at_end_of_segment() or quotation_mark_match.has_trailing_whitespace() + ) def _most_recent_opening_mark_immediately_precedes(self, match: QuotationMarkStringMatch) -> bool: if not self._quotation_mark_resolver_state.has_open_quotation_mark(): @@ -290,30 +286,31 @@ def _most_recent_opening_mark_immediately_precedes(self, match: QuotationMarkStr def is_apostrophe( self, - match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, next_match: Union[QuotationMarkStringMatch, None], ) -> bool: - if not match.quotation_mark_matches(self._APOSTROPHE_PATTERN): + if not quotation_mark_match.quotation_mark_matches(self._APOSTROPHE_PATTERN): return False # Latin letters on both sides of punctuation mark if ( - match.previous_character is not None - and match.has_leading_latin_letter() - and match.next_character is not None - and match.has_trailing_latin_letter() + quotation_mark_match.previous_character is not None + and quotation_mark_match.has_leading_latin_letter() + and quotation_mark_match.next_character is not None + and quotation_mark_match.has_trailing_latin_letter() ): return True # potential final s possessive (e.g. 
Moses') - if match.previous_character_matches(regex.compile(r"s")) and ( - match.has_trailing_whitespace() or match.has_trailing_punctuation() + if quotation_mark_match.previous_character_matches(regex.compile(r"s")) and ( + quotation_mark_match.has_trailing_whitespace() or quotation_mark_match.has_trailing_punctuation() ): - # check whether it could be a closing quote + # check whether it could be a closing quotation mark if not self._quotation_mark_resolver_state.has_open_quotation_mark(): return True if not self._settings.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.quotation_mark + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), + quotation_mark_match.quotation_mark, ): return True if next_match is not None and self._settings.are_marks_a_valid_pair( @@ -325,10 +322,11 @@ def is_apostrophe( # for languages that use apostrophes at the start and end of words if ( not self._quotation_mark_resolver_state.has_open_quotation_mark() - and match.quotation_mark == "'" + and quotation_mark_match.quotation_mark == "'" or self._quotation_mark_resolver_state.has_open_quotation_mark() and not self._settings.are_marks_a_valid_pair( - self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), match.quotation_mark + self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(), + quotation_mark_match.quotation_mark, ) ): return True @@ -340,88 +338,92 @@ class DepthBasedQuotationMarkResolver(QuotationMarkResolver): def __init__(self, settings: QuotationMarkResolutionSettings): self._settings = settings self._quotation_mark_resolver_state = QuotationMarkResolverState() - self._quotation_continuer_state = QuotationContinuerState() + self._quote_continuer_state = QuoteContinuerState() self._quotation_mark_categorizer = QuotationMarkCategorizer( - self._settings, self._quotation_mark_resolver_state, self._quotation_continuer_state + self._settings, 
self._quotation_mark_resolver_state, self._quote_continuer_state ) self._issues: Set[QuotationMarkResolutionIssue] = set() def reset(self) -> None: self._quotation_mark_resolver_state.reset() - self._quotation_continuer_state.reset() + self._quote_continuer_state.reset() self._issues = set() def resolve_quotation_marks( - self, quote_matches: list[QuotationMarkStringMatch] + self, quotation_mark_matches: list[QuotationMarkStringMatch] ) -> Generator[QuotationMarkMetadata, None, None]: - for quote_index, quote_match in enumerate(quote_matches): - previous_mark = None if quote_index == 0 else quote_matches[quote_index - 1] - next_mark = None if quote_index == len(quote_matches) - 1 else quote_matches[quote_index + 1] - yield from self._resolve_quotation_mark(quote_match, previous_mark, next_mark) + for index, quotation_mark_match in enumerate(quotation_mark_matches): + previous_mark = None if index == 0 else quotation_mark_matches[index - 1] + next_mark = None if index == len(quotation_mark_matches) - 1 else quotation_mark_matches[index + 1] + yield from self._resolve_quotation_mark(quotation_mark_match, previous_mark, next_mark) if self._quotation_mark_resolver_state.has_open_quotation_mark(): self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) def _resolve_quotation_mark( self, - quote_match: QuotationMarkStringMatch, + quotation_mark_match: QuotationMarkStringMatch, previous_mark: Union[QuotationMarkStringMatch, None], next_mark: Union[QuotationMarkStringMatch, None], ) -> Generator[QuotationMarkMetadata, None, None]: - if self._quotation_mark_categorizer.is_opening_quote(quote_match): - if self._quotation_mark_categorizer.is_english_quotation_continuer(quote_match, previous_mark, next_mark): - yield self._process_quotation_continuer(quote_match, QuotationContinuerStyle.ENGLISH) + if self._quotation_mark_categorizer.is_opening_quotation_mark(quotation_mark_match): + if self._quotation_mark_categorizer.is_english_quote_continuer( + 
quotation_mark_match, previous_mark, next_mark + ): + yield self._process_quote_continuer(quotation_mark_match, QuoteContinuerStyle.ENGLISH) else: if self._is_depth_too_great(): self._issues.add(QuotationMarkResolutionIssue.TOO_DEEP_NESTING) return - yield self._process_opening_mark(quote_match) - elif self._quotation_mark_categorizer.is_apostrophe(quote_match, next_mark): + yield self._process_opening_mark(quotation_mark_match) + elif self._quotation_mark_categorizer.is_apostrophe(quotation_mark_match, next_mark): pass - elif self._quotation_mark_categorizer.is_closing_quote(quote_match): - if self._quotation_mark_categorizer.is_spanish_quotation_continuer(quote_match, previous_mark, next_mark): - yield self._process_quotation_continuer(quote_match, QuotationContinuerStyle.SPANISH) + elif self._quotation_mark_categorizer.is_closing_quotation_mark(quotation_mark_match): + if self._quotation_mark_categorizer.is_spanish_quote_continuer( + quotation_mark_match, previous_mark, next_mark + ): + yield self._process_quote_continuer(quotation_mark_match, QuoteContinuerStyle.SPANISH) elif not self._quotation_mark_resolver_state.has_open_quotation_mark(): self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) return else: - yield self._process_closing_mark(quote_match) - elif self._quotation_mark_categorizer.is_malformed_closing_quote(quote_match): - yield self._process_closing_mark(quote_match) - elif self._quotation_mark_categorizer.is_malformed_opening_quote(quote_match): - yield self._process_opening_mark(quote_match) - elif self._quotation_mark_categorizer.is_unpaired_closing_quote(quote_match): + yield self._process_closing_mark(quotation_mark_match) + elif self._quotation_mark_categorizer.is_malformed_closing_quotation_mark(quotation_mark_match): + yield self._process_closing_mark(quotation_mark_match) + elif self._quotation_mark_categorizer.is_malformed_opening_quotation_mark(quotation_mark_match): + yield 
self._process_opening_mark(quotation_mark_match) + elif self._quotation_mark_categorizer.is_unpaired_closing_quotation_mark(quotation_mark_match): self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK) else: self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) - def _process_quotation_continuer( - self, quote_match: QuotationMarkStringMatch, continuer_style: QuotationContinuerStyle + def _process_quote_continuer( + self, quotation_mark_match: QuotationMarkStringMatch, continuer_style: QuoteContinuerStyle ) -> QuotationMarkMetadata: - return self._quotation_continuer_state.add_quotation_continuer( - quote_match, self._quotation_mark_resolver_state, continuer_style + return self._quote_continuer_state.add_quote_continuer( + quotation_mark_match, self._quotation_mark_resolver_state, continuer_style ) def _is_depth_too_great(self) -> bool: return self._quotation_mark_resolver_state.are_more_than_n_quotes_open(3) - def _process_opening_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + def _process_opening_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: if not self._settings.metadata_matches_quotation_mark( - quote_match.quotation_mark, - self._quotation_mark_resolver_state.current_depth, + quotation_mark_match.quotation_mark, + self._quotation_mark_resolver_state.current_depth + 1, QuotationMarkDirection.OPENING, ): self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) - return self._quotation_mark_resolver_state.add_opening_quotation_mark(quote_match) + return self._quotation_mark_resolver_state.add_opening_quotation_mark(quotation_mark_match) - def _process_closing_mark(self, quote_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: + def _process_closing_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> QuotationMarkMetadata: if not self._settings.metadata_matches_quotation_mark( - quote_match.quotation_mark, - 
self._quotation_mark_resolver_state.current_depth - 1, + quotation_mark_match.quotation_mark, + self._quotation_mark_resolver_state.current_depth, QuotationMarkDirection.CLOSING, ): self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK) - return self._quotation_mark_resolver_state.add_closing_quotation_mark(quote_match) + return self._quotation_mark_resolver_state.add_closing_quotation_mark(quotation_mark_match) def get_issues(self) -> Set[QuotationMarkResolutionIssue]: return self._issues diff --git a/machine/corpora/punctuation_analysis/preliminary_quotation_analyzer.py b/machine/corpora/punctuation_analysis/preliminary_quotation_mark_analyzer.py similarity index 65% rename from machine/corpora/punctuation_analysis/preliminary_quotation_analyzer.py rename to machine/corpora/punctuation_analysis/preliminary_quotation_mark_analyzer.py index ab1b5ce2..7367d5cc 100644 --- a/machine/corpora/punctuation_analysis/preliminary_quotation_analyzer.py +++ b/machine/corpora/punctuation_analysis/preliminary_quotation_mark_analyzer.py @@ -1,3 +1,4 @@ +from collections import Counter, defaultdict from typing import Dict, Generator, List, Tuple import regex @@ -31,132 +32,115 @@ def is_apostrophe_proportion_greater_than(self, threshold: float) -> bool: class QuotationMarkWordPositions: + _MAXIMUM_PROPORTION_FOR_RARITY = 0.1 + _MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD = 0.3 + def __init__(self): self.reset() def reset(self) -> None: - self._word_initial_occurrences: Dict[str, int] = dict() - self._mid_word_occurrences: Dict[str, int] = dict() - self._word_final_occurrences: Dict[str, int] = dict() + self._word_initial_occurrences: Counter[str] = Counter() + self._mid_word_occurrences: Counter[str] = Counter() + self._word_final_occurrences: Counter[str] = Counter() + self._total_occurrences: Counter[str] = Counter() def count_word_initial_apostrophe(self, quotation_mark: str) -> None: - if quotation_mark not in self._word_initial_occurrences: - 
self._word_initial_occurrences[quotation_mark] = 0 - self._word_initial_occurrences[quotation_mark] += 1 + self._word_initial_occurrences.update([quotation_mark]) + self._total_occurrences.update([quotation_mark]) def count_mid_word_apostrophe(self, quotation_mark: str) -> None: - if quotation_mark not in self._mid_word_occurrences: - self._mid_word_occurrences[quotation_mark] = 0 - self._mid_word_occurrences[quotation_mark] += 1 + self._mid_word_occurrences.update([quotation_mark]) + self._total_occurrences.update([quotation_mark]) def count_word_final_apostrophe(self, quotation_mark: str) -> None: - if quotation_mark not in self._word_final_occurrences: - self._word_final_occurrences[quotation_mark] = 0 - self._word_final_occurrences[quotation_mark] += 1 - - def _get_word_initial_occurrences(self, quotation_mark: str) -> int: - return self._word_initial_occurrences[quotation_mark] if quotation_mark in self._word_initial_occurrences else 0 - - def _get_mid_word_occurrences(self, quotation_mark: str) -> int: - return self._mid_word_occurrences[quotation_mark] if quotation_mark in self._mid_word_occurrences else 0 - - def _get_word_final_occurrences(self, quotation_mark: str) -> int: - return self._word_final_occurrences[quotation_mark] if quotation_mark in self._word_final_occurrences else 0 + self._word_final_occurrences.update([quotation_mark]) + self._total_occurrences.update([quotation_mark]) def _get_total_occurrences(self, quotation_mark: str) -> int: return ( - self._get_word_initial_occurrences(quotation_mark) - + self._get_mid_word_occurrences(quotation_mark) - + self._get_word_final_occurrences(quotation_mark) + self._word_initial_occurrences[quotation_mark] + + self._mid_word_occurrences[quotation_mark] + + self._word_final_occurrences[quotation_mark] ) def is_mark_rarely_initial(self, quotation_mark: str) -> bool: - num_initial_marks: int = self._get_word_initial_occurrences(quotation_mark) - num_total_marks: int = 
self._get_total_occurrences(quotation_mark) - return num_total_marks > 0 and num_initial_marks / num_total_marks < 0.1 + num_initial_marks: int = self._word_initial_occurrences[quotation_mark] + num_total_marks: int = self._total_occurrences[quotation_mark] + return num_total_marks > 0 and num_initial_marks / num_total_marks < self._MAXIMUM_PROPORTION_FOR_RARITY def is_mark_rarely_final(self, quotation_mark: str) -> bool: - num_final_marks: int = self._get_word_final_occurrences(quotation_mark) - num_total_marks: int = self._get_total_occurrences(quotation_mark) - return num_total_marks > 0 and num_final_marks / num_total_marks < 0.1 + num_final_marks: int = self._word_final_occurrences[quotation_mark] + num_total_marks: int = self._total_occurrences[quotation_mark] + return num_total_marks > 0 and num_final_marks / num_total_marks < self._MAXIMUM_PROPORTION_FOR_RARITY def are_initial_and_final_rates_similar(self, quotation_mark: str) -> bool: - num_initial_marks: int = self._get_word_initial_occurrences(quotation_mark) - num_final_marks: int = self._get_word_final_occurrences(quotation_mark) - num_total_marks: int = self._get_total_occurrences(quotation_mark) - return num_total_marks > 0 and abs(num_initial_marks - num_final_marks) / num_total_marks < 0.3 + num_initial_marks: int = self._word_initial_occurrences[quotation_mark] + num_final_marks: int = self._word_final_occurrences[quotation_mark] + num_total_marks: int = self._total_occurrences[quotation_mark] + return ( + num_total_marks > 0 + and abs(num_initial_marks - num_final_marks) / num_total_marks + < self._MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD + ) def is_mark_commonly_mid_word(self, quotation_mark: str) -> bool: - num_mid_word_marks: int = self._get_mid_word_occurrences(quotation_mark) - num_total_marks: int = self._get_total_occurrences(quotation_mark) - return num_total_marks > 0 and num_mid_word_marks / num_total_marks > 0.3 + num_mid_word_marks: int = self._mid_word_occurrences[quotation_mark] + 
num_total_marks: int = self._total_occurrences[quotation_mark] + return ( + num_total_marks > 0 and num_mid_word_marks / num_total_marks > self._MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD + ) class QuotationMarkSequences: + _SOLE_OCCURRENCE_MINIMUM_COUNT = 5 + _MUCH_MORE_COMMON_MINIMUM_RATIO = 10 + _MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD = 0.2 + def __init__(self): self.reset() def reset(self) -> None: - self._earlier_quotation_mark_counts: Dict[str, int] = dict() - self._later_quotation_mark_counts: Dict[str, int] = dict() - - def record_earlier_quotation_mark(self, quotation_mark: str) -> None: - if quotation_mark not in self._earlier_quotation_mark_counts: - self._earlier_quotation_mark_counts[quotation_mark] = 0 - self._earlier_quotation_mark_counts[quotation_mark] += 1 + self._earlier_quotation_mark_counts: Counter[str] = Counter() + self._later_quotation_mark_counts: Counter[str] = Counter() - def record_later_quotation_mark(self, quotation_mark: str) -> None: - if quotation_mark not in self._later_quotation_mark_counts: - self._later_quotation_mark_counts[quotation_mark] = 0 - self._later_quotation_mark_counts[quotation_mark] += 1 + def count_earlier_quotation_mark(self, quotation_mark: str) -> None: + self._earlier_quotation_mark_counts.update([quotation_mark]) - def _get_earlier_occurrences(self, quotation_mark: str) -> int: - return ( - self._earlier_quotation_mark_counts[quotation_mark] - if quotation_mark in self._earlier_quotation_mark_counts - else 0 - ) - - def _get_later_occurrences(self, quotation_mark: str) -> int: - return ( - self._later_quotation_mark_counts[quotation_mark] - if quotation_mark in self._later_quotation_mark_counts - else 0 - ) + def count_later_quotation_mark(self, quotation_mark: str) -> None: + self._later_quotation_mark_counts.update([quotation_mark]) def is_mark_much_more_common_earlier(self, quotation_mark: str) -> bool: - num_early_occurrences: int = self._get_earlier_occurrences(quotation_mark) - num_late_occurrences: int = 
self._get_later_occurrences(quotation_mark) - return ( - num_late_occurrences == 0 and num_early_occurrences > 5 - ) or num_early_occurrences > num_late_occurrences * 10 + num_early_occurrences: int = self._earlier_quotation_mark_counts[quotation_mark] + num_late_occurrences: int = self._later_quotation_mark_counts[quotation_mark] + return (num_late_occurrences == 0 and num_early_occurrences > self._SOLE_OCCURRENCE_MINIMUM_COUNT) or ( + num_early_occurrences > num_late_occurrences * self._MUCH_MORE_COMMON_MINIMUM_RATIO + ) def is_mark_much_more_common_later(self, quotation_mark: str) -> bool: - num_early_occurrences: int = self._get_earlier_occurrences(quotation_mark) - num_late_occurrences: int = self._get_later_occurrences(quotation_mark) - return ( - num_early_occurrences == 0 and num_late_occurrences > 5 - ) or num_late_occurrences > num_early_occurrences * 10 + num_early_occurrences: int = self._earlier_quotation_mark_counts[quotation_mark] + num_late_occurrences: int = self._later_quotation_mark_counts[quotation_mark] + return (num_early_occurrences == 0 and num_late_occurrences > self._SOLE_OCCURRENCE_MINIMUM_COUNT) or ( + num_late_occurrences > num_early_occurrences * self._MUCH_MORE_COMMON_MINIMUM_RATIO + ) - def is_mark_common_early_and_late(self, quotation_mark: str) -> bool: - num_early_occurrences: int = self._get_earlier_occurrences(quotation_mark) - num_late_occurrences: int = self._get_later_occurrences(quotation_mark) + def are_early_and_late_mark_rates_similar(self, quotation_mark: str) -> bool: + num_early_occurrences: int = self._earlier_quotation_mark_counts[quotation_mark] + num_late_occurrences: int = self._later_quotation_mark_counts[quotation_mark] return ( num_early_occurrences > 0 - and abs(num_late_occurrences - num_early_occurrences) / num_early_occurrences < 0.2 + and abs(num_late_occurrences - num_early_occurrences) / (num_early_occurrences + num_late_occurrences) + < self._MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD ) class 
QuotationMarkGrouper: - def __init__(self, quotation_marks: List[QuotationMarkStringMatch], quote_convention_set: QuoteConventionSet): - self._quote_convention_set = quote_convention_set + def __init__(self, quotation_marks: List[QuotationMarkStringMatch], quote_conventions: QuoteConventionSet): + self._quote_conventions = quote_conventions self._group_quotation_marks(quotation_marks) def _group_quotation_marks(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: - self._grouped_quotation_marks: Dict[str, List[QuotationMarkStringMatch]] = dict() + self._grouped_quotation_marks: Dict[str, List[QuotationMarkStringMatch]] = defaultdict(list) for quotation_mark_match in quotation_marks: - if quotation_mark_match.quotation_mark not in self._grouped_quotation_marks: - self._grouped_quotation_marks[quotation_mark_match.quotation_mark] = [] self._grouped_quotation_marks[quotation_mark_match.quotation_mark].append(quotation_mark_match) def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: @@ -164,7 +148,7 @@ def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: # handle cases of identical opening/closing marks if ( len(matches1) == 2 - and self._quote_convention_set.is_quotation_mark_direction_ambiguous(mark1) + and self._quote_conventions.is_quotation_mark_direction_ambiguous(mark1) and not self.has_distinct_paired_quotation_mark(mark1) ): yield (mark1, mark1) @@ -178,7 +162,7 @@ def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: for mark2, matches2 in self._grouped_quotation_marks.items(): if ( len(matches2) == 1 - and self._quote_convention_set.marks_are_a_valid_pair(mark1, mark2) + and self._quote_conventions.marks_are_a_valid_pair(mark1, mark2) and matches1[0].precedes(matches2[0]) ): yield (mark1, mark2) @@ -187,13 +171,14 @@ def has_distinct_paired_quotation_mark(self, quotation_mark: str) -> bool: return any( [ mark != quotation_mark and mark in self._grouped_quotation_marks - for 
mark in self._quote_convention_set.get_possible_paired_quotation_marks(quotation_mark) + for mark in self._quote_conventions.get_possible_paired_quotation_marks(quotation_mark) ] ) class PreliminaryApostropheAnalyzer: _APOSTROPHE_PATTERN = regex.compile(r"[\'\u2019]", regex.U) + _MAXIMUM_APOSTROPHE_PROPORTION = 0.02 def __init__(self): self._apostrophe_proportion_statistics = ApostropheProportionStatistics() @@ -261,13 +246,15 @@ def is_apostrophe_only(self, mark: str) -> bool: ) and self._word_position_statistics.is_mark_commonly_mid_word(mark): return True - if self._apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.02): + if self._apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than( + self._MAXIMUM_APOSTROPHE_PROPORTION + ): return True return False -class PreliminaryQuotationAnalyzer: +class PreliminaryQuotationMarkAnalyzer: def __init__(self, quote_conventions: QuoteConventionSet): self._quote_conventions = quote_conventions @@ -298,8 +285,8 @@ def _analyze_quotation_marks_for_verse(self, verse: Verse) -> None: def _analyze_quotation_mark_sequence(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: quotation_mark_grouper: QuotationMarkGrouper = QuotationMarkGrouper(quotation_marks, self._quote_conventions) for earlier_mark, later_mark in quotation_mark_grouper.get_quotation_mark_pairs(): - self._quotation_mark_sequences.record_earlier_quotation_mark(earlier_mark) - self._quotation_mark_sequences.record_later_quotation_mark(later_mark) + self._quotation_mark_sequences.count_earlier_quotation_mark(earlier_mark) + self._quotation_mark_sequences.count_later_quotation_mark(later_mark) def _select_compatible_quote_conventions(self) -> QuoteConventionSet: opening_quotation_marks = self._find_opening_quotation_marks() @@ -322,7 +309,7 @@ def _is_opening_quotation_mark(self, quotation_mark: str) -> bool: if self._quotation_mark_sequences.is_mark_much_more_common_earlier(quotation_mark): return True - if 
self._quotation_mark_sequences.is_mark_common_early_and_late( + if self._quotation_mark_sequences.are_early_and_late_mark_rates_similar( quotation_mark ) and self._quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): return True @@ -342,7 +329,7 @@ def _is_closing_quotation_mark(self, quotation_mark: str) -> bool: if self._quotation_mark_sequences.is_mark_much_more_common_later(quotation_mark): return True - if self._quotation_mark_sequences.is_mark_common_early_and_late( + if self._quotation_mark_sequences.are_early_and_late_mark_rates_similar( quotation_mark ) and self._quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): return True diff --git a/machine/corpora/punctuation_analysis/quotation_mark_finder.py b/machine/corpora/punctuation_analysis/quotation_mark_finder.py index 16d00d34..73c95368 100644 --- a/machine/corpora/punctuation_analysis/quotation_mark_finder.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_finder.py @@ -10,10 +10,10 @@ class QuotationMarkFinder: - _QUOTE_PATTERN = regex.compile(r"(\p{Quotation_Mark}|<<|>>|<|>)", regex.U) + _QUOTATION_MARK_PATTERN = regex.compile(r"(\p{Quotation_Mark}|<<|>>|<|>)", regex.U) - def __init__(self, quote_convention_set: QuoteConventionSet): - self._quote_convention_set = quote_convention_set + def __init__(self, quote_conventions: QuoteConventionSet): + self._quote_conventions = quote_conventions def find_all_potential_quotation_marks_in_chapter(self, chapter: Chapter) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] @@ -36,9 +36,11 @@ def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment ) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for quote_match in self._QUOTE_PATTERN.finditer(text_segment.text): - if self._quote_convention_set.is_valid_opening_quotation_mark( - quote_match.group() - ) or 
self._quote_convention_set.is_valid_closing_quotation_mark(quote_match.group()): - quotation_matches.append(QuotationMarkStringMatch(text_segment, quote_match.start(), quote_match.end())) + for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text): + if self._quote_conventions.is_valid_opening_quotation_mark( + quotation_mark_match.group() + ) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()): + quotation_matches.append( + QuotationMarkStringMatch(text_segment, quotation_mark_match.start(), quotation_mark_match.end()) + ) return quotation_matches diff --git a/machine/corpora/punctuation_analysis/quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/quotation_mark_resolver.py index 189c21dc..3e9097f5 100644 --- a/machine/corpora/punctuation_analysis/quotation_mark_resolver.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_resolver.py @@ -3,20 +3,17 @@ from .quotation_mark_metadata import QuotationMarkMetadata from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue -from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings from .quotation_mark_string_match import QuotationMarkStringMatch class QuotationMarkResolver(ABC): - def __init__(self, settings: QuotationMarkResolutionSettings): - self._settings = settings - @abstractmethod def resolve_quotation_marks( - self, quote_matches: List[QuotationMarkStringMatch] + self, quotation_mark_matches: List[QuotationMarkStringMatch] ) -> Generator[QuotationMarkMetadata, None, None]: ... + @abstractmethod def reset(self) -> None: ... 
@abstractmethod diff --git a/machine/corpora/punctuation_analysis/quotation_mark_string_match.py b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py index c2e47aaf..9de4b8e2 100644 --- a/machine/corpora/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py @@ -37,11 +37,11 @@ def __eq__(self, value): def quotation_mark(self) -> str: return self._text_segment.text[self._start_index : self._end_index] - def is_valid_opening_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: - return quote_convention_set.is_valid_opening_quotation_mark(self.quotation_mark) + def is_valid_opening_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool: + return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark) - def is_valid_closing_quotation_mark(self, quote_convention_set: QuoteConventionSet) -> bool: - return quote_convention_set.is_valid_closing_quotation_mark(self.quotation_mark) + def is_valid_closing_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool: + return quote_conventions.is_valid_closing_quotation_mark(self.quotation_mark) def quotation_mark_matches(self, regex_pattern: regex.Pattern) -> bool: return regex_pattern.search(self.quotation_mark) is not None diff --git a/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py b/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py index fd6935df..ec17eba6 100644 --- a/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py @@ -1,4 +1,5 @@ -from typing import Dict, List +from collections import Counter, defaultdict +from typing import List from .quotation_mark_direction import QuotationMarkDirection from .quotation_mark_metadata import QuotationMarkMetadata @@ -7,23 +8,18 @@ class QuotationMarkCounts: def __init__(self): - self._string_counts: Dict[str, int] = dict() + 
self._quotation_mark_counter: Counter[str] = Counter() self._total_count = 0 def count_quotation_mark(self, quotation_mark: str) -> None: - if quotation_mark not in self._string_counts: - self._string_counts[quotation_mark] = 0 - self._string_counts[quotation_mark] += 1 + self._quotation_mark_counter.update([quotation_mark]) self._total_count += 1 def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]: - best_str = max(self._string_counts, key=lambda x: self._string_counts[x]) - return (best_str, self._string_counts[best_str], self._total_count) + return self._quotation_mark_counter.most_common(1)[0] + (self._total_count,) def calculate_num_differences(self, expected_quotation_mark: str) -> int: - if expected_quotation_mark not in self._string_counts: - return self._total_count - return self._total_count - self._string_counts[expected_quotation_mark] + return self._total_count - self._quotation_mark_counter[expected_quotation_mark] def get_observed_count(self) -> int: return self._total_count @@ -32,46 +28,43 @@ def get_observed_count(self) -> int: class QuotationMarkTabulator: def __init__(self): - self.quotation_counts_by_depth_and_direction: dict[tuple[int, QuotationMarkDirection], QuotationMarkCounts] = ( - dict() + self._quotation_counts_by_depth_and_direction: dict[tuple[int, QuotationMarkDirection], QuotationMarkCounts] = ( + defaultdict(QuotationMarkCounts) ) def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None: for quotation_mark in quotation_marks: self._count_quotation_mark(quotation_mark) - def _count_quotation_mark(self, quote: QuotationMarkMetadata) -> None: - key = (quote.depth, quote.direction) - quotation_mark = quote.quotation_mark - if key not in self.quotation_counts_by_depth_and_direction: - self.quotation_counts_by_depth_and_direction[key] = QuotationMarkCounts() - self.quotation_counts_by_depth_and_direction[key].count_quotation_mark(quotation_mark) + def _count_quotation_mark(self, quotation_mark: 
QuotationMarkMetadata) -> None: + key = (quotation_mark.depth, quotation_mark.direction) + self._quotation_counts_by_depth_and_direction[key].count_quotation_mark(quotation_mark.quotation_mark) def _depth_and_direction_observed(self, depth: int, direction: QuotationMarkDirection) -> bool: - return (depth, direction) in self.quotation_counts_by_depth_and_direction + return (depth, direction) in self._quotation_counts_by_depth_and_direction def _find_most_common_quotation_mark_with_depth_and_direction( self, depth: int, direction: QuotationMarkDirection ) -> tuple[str, int, int]: - return self.quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion() + return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion() def calculate_similarity(self, quote_convention: QuoteConvention) -> float: - num_differences = 0 - num_total_quotation_marks = 0 - for depth, direction in self.quotation_counts_by_depth_and_direction: + weighted_difference = 0 + total_weight = 0 + for depth, direction in self._quotation_counts_by_depth_and_direction: expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction) # give higher weight to shallower depths, since deeper marks are more likely to be mistakes - num_differences += self.quotation_counts_by_depth_and_direction[ + weighted_difference += self._quotation_counts_by_depth_and_direction[ (depth, direction) ].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth) - num_total_quotation_marks += self.quotation_counts_by_depth_and_direction[ + total_weight += self._quotation_counts_by_depth_and_direction[ (depth, direction) ].get_observed_count() * 2 ** (-depth) - if num_total_quotation_marks == 0: + if total_weight == 0: return 0 - return 1 - (num_differences / num_total_quotation_marks) + return 1 - (weighted_difference / total_weight) def get_summary_message(self) -> str: message_lines: List[str] = [] @@ -90,7 
+83,10 @@ def get_summary_message(self) -> str: ) ) message_lines.append( - "The most common level %i quotes are %s (%i of %i opening quotes) and %s (%i of %i closing quotes)" + ( + "The most common level %i quotation marks are " + + "%s (%i of %i opening marks) and %s (%i of %i closing marks)" + ) % ( depth, opening_quotation_mark, diff --git a/machine/corpora/punctuation_analysis/quote_convention.py b/machine/corpora/punctuation_analysis/quote_convention.py index 3dc3a9e0..23063d42 100644 --- a/machine/corpora/punctuation_analysis/quote_convention.py +++ b/machine/corpora/punctuation_analysis/quote_convention.py @@ -21,21 +21,21 @@ @dataclass(frozen=True) class SingleLevelQuoteConvention: - opening_quote: str - closing_quote: str + opening_quotation_mark: str + closing_quotation_mark: str def normalize(self) -> "SingleLevelQuoteConvention": - normalized_opening_quote = ( - _QUOTATION_MARK_NORMALIZATION_MAP[self.opening_quote] - if self.opening_quote in _QUOTATION_MARK_NORMALIZATION_MAP - else self.opening_quote + normalized_opening_quotation_mark = ( + _QUOTATION_MARK_NORMALIZATION_MAP[self.opening_quotation_mark] + if self.opening_quotation_mark in _QUOTATION_MARK_NORMALIZATION_MAP + else self.opening_quotation_mark ) - normalized_closing_quote = ( - _QUOTATION_MARK_NORMALIZATION_MAP[self.closing_quote] - if self.closing_quote in _QUOTATION_MARK_NORMALIZATION_MAP - else self.closing_quote + normalized_closing_quotation_mark = ( + _QUOTATION_MARK_NORMALIZATION_MAP[self.closing_quotation_mark] + if self.closing_quotation_mark in _QUOTATION_MARK_NORMALIZATION_MAP + else self.closing_quotation_mark ) - return SingleLevelQuoteConvention(normalized_opening_quote, normalized_closing_quote) + return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark) class QuoteConvention: @@ -51,9 +51,9 @@ def __eq__(self, value): if len(self.levels) != len(value.levels): return False for level, other_level in zip(self.levels, value.levels): 
- if level.opening_quote != other_level.opening_quote: + if level.opening_quotation_mark != other_level.opening_quotation_mark: return False - if level.closing_quote != other_level.closing_quote: + if level.closing_quotation_mark != other_level.closing_quotation_mark: return False return True @@ -65,39 +65,39 @@ def name(self) -> str: def num_levels(self) -> int: return len(self.levels) - def get_opening_quote_at_level(self, level: int) -> str: - return self.levels[level - 1].opening_quote + def get_opening_quotation_mark_at_level(self, level: int) -> str: + return self.levels[level - 1].opening_quotation_mark - def get_closing_quote_at_level(self, level: int) -> str: - return self.levels[level - 1].closing_quote + def get_closing_quotation_mark_at_level(self, level: int) -> str: + return self.levels[level - 1].closing_quotation_mark def get_expected_quotation_mark(self, depth: int, direction: QuotationMarkDirection) -> str: if depth > self.num_levels or depth < 1: return "" return ( - self.get_opening_quote_at_level(depth) + self.get_opening_quotation_mark_at_level(depth) if direction is QuotationMarkDirection.OPENING - else self.get_closing_quote_at_level(depth) + else self.get_closing_quotation_mark_at_level(depth) ) def _includes_opening_quotation_mark(self, opening_quotation_mark: str) -> bool: for level in self.levels: - if level.opening_quote == opening_quotation_mark: + if level.opening_quotation_mark == opening_quotation_mark: return True return False def _includes_closing_quotation_mark(self, closing_quotation_mark: str) -> bool: for level in self.levels: - if level.closing_quote == closing_quotation_mark: + if level.closing_quotation_mark == closing_quotation_mark: return True return False def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: depths: Set[int] = set() for depth, level in enumerate(self.levels, start=1): - if direction is QuotationMarkDirection.OPENING and level.opening_quote == quotation_mark: 
+ if direction is QuotationMarkDirection.OPENING and level.opening_quotation_mark == quotation_mark: depths.add(depth) - elif direction is QuotationMarkDirection.CLOSING and level.closing_quote == quotation_mark: + elif direction is QuotationMarkDirection.CLOSING and level.closing_quotation_mark == quotation_mark: depths.add(depth) return depths @@ -111,10 +111,11 @@ def is_compatible_with_observed_quotation_marks( if not self._includes_closing_quotation_mark(closing_quotation_mark): return False - # we require the first-level quotes to have been observed - if self.get_opening_quote_at_level(1) not in opening_quotation_marks: - return False - if self.get_closing_quote_at_level(1) not in closing_quotation_marks: + # we require the first-level quotation marks to have been observed + if ( + self.get_opening_quotation_mark_at_level(1) not in opening_quotation_marks + or self.get_closing_quotation_mark_at_level(1) not in closing_quotation_marks + ): return False return True @@ -126,9 +127,9 @@ def __str__(self) -> str: for level, convention in enumerate(self.levels): ordinal_name = self._get_ordinal_name(level + 1) summary += "%s%s-level quote%s\n" % ( - convention.opening_quote, + convention.opening_quotation_mark, ordinal_name, - convention.closing_quote, + convention.closing_quotation_mark, ) return summary diff --git a/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py b/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py index 97591194..be43806c 100644 --- a/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py +++ b/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py @@ -10,34 +10,34 @@ class QuoteConventionDetectionResolutionSettings(QuotationMarkResolutionSettings): - def __init__(self, quote_convention_set: QuoteConventionSet): - self._quote_convention_set = quote_convention_set + def __init__(self, quote_conventions: 
QuoteConventionSet): + self._quote_conventions = quote_conventions def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: - return quotation_mark_match.is_valid_opening_quotation_mark(self._quote_convention_set) + return quotation_mark_match.is_valid_opening_quotation_mark(self._quote_conventions) def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: - return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_convention_set) + return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_conventions) @property def opening_quotation_mark_regex(self) -> regex.Pattern: - return self._quote_convention_set.opening_quotation_mark_regex + return self._quote_conventions.opening_quotation_mark_regex @property def closing_quotation_mark_regex(self) -> regex.Pattern: - return self._quote_convention_set.closing_quotation_mark_regex + return self._quote_conventions.closing_quotation_mark_regex def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: - return self._quote_convention_set.marks_are_a_valid_pair(opening_mark, closing_mark) + return self._quote_conventions.marks_are_a_valid_pair(opening_mark, closing_mark) @property def should_rely_on_paragraph_markers(self): return True def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: - return self._quote_convention_set.get_possible_depths(quotation_mark, direction) + return self._quote_conventions.get_possible_depths(quotation_mark, direction) def metadata_matches_quotation_mark( self, quotation_mark: str, depth: int, direction: QuotationMarkDirection ) -> bool: - return self._quote_convention_set.metadata_matches_quotation_mark(quotation_mark, depth, direction) + return self._quote_conventions.metadata_matches_quotation_mark(quotation_mark, depth, direction) diff --git a/machine/corpora/punctuation_analysis/quote_convention_detector.py 
b/machine/corpora/punctuation_analysis/quote_convention_detector.py index 4b915365..733111a4 100644 --- a/machine/corpora/punctuation_analysis/quote_convention_detector.py +++ b/machine/corpora/punctuation_analysis/quote_convention_detector.py @@ -3,7 +3,7 @@ from .chapter import Chapter from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver -from .preliminary_quotation_analyzer import PreliminaryQuotationAnalyzer +from .preliminary_quotation_mark_analyzer import PreliminaryQuotationMarkAnalyzer from .quotation_mark_finder import QuotationMarkFinder from .quotation_mark_metadata import QuotationMarkMetadata from .quotation_mark_string_match import QuotationMarkStringMatch @@ -28,7 +28,7 @@ def __init__(self): self._quotation_mark_tabulator = QuotationMarkTabulator() def _count_quotation_marks_in_chapters(self, chapters: list[Chapter]) -> None: - possible_quote_conventions: QuoteConventionSet = PreliminaryQuotationAnalyzer( + possible_quote_conventions: QuoteConventionSet = PreliminaryQuotationMarkAnalyzer( STANDARD_QUOTE_CONVENTIONS ).narrow_down_possible_quote_conventions(chapters) @@ -50,7 +50,7 @@ def _count_quotation_marks_in_chapter( self._quotation_mark_tabulator.tabulate(resolved_quotation_marks) - def detect_quotation_convention(self, print_summary: bool) -> Union[QuoteConventionAnalysis, None]: + def detect_quote_convention(self, print_summary: bool) -> Union[QuoteConventionAnalysis, None]: self._count_quotation_marks_in_chapters(self.get_chapters()) (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention( diff --git a/machine/corpora/punctuation_analysis/quote_convention_set.py b/machine/corpora/punctuation_analysis/quote_convention_set.py index 01128dc6..a006310f 100644 --- a/machine/corpora/punctuation_analysis/quote_convention_set.py +++ b/machine/corpora/punctuation_analysis/quote_convention_set.py @@ -1,3 +1,4 @@ +from collections import defaultdict from re import Pattern from typing import 
Dict, List, Set, Tuple, Union @@ -11,7 +12,7 @@ class QuoteConventionSet: def __init__(self, conventions: List[QuoteConvention]): self._conventions = conventions - self._create_quote_regexes() + self._create_quotation_mark_regexes() self._create_quotation_mark_pair_map() def __eq__(self, other: object) -> bool: @@ -19,52 +20,43 @@ def __eq__(self, other: object) -> bool: return False return self._conventions == other._conventions - def _create_quote_regexes(self) -> None: + def _create_quotation_mark_regexes(self) -> None: + self._opening_quotation_mark_regex = regex.compile(r"") + self._closing_quotation_mark_regex = regex.compile(r"") + self._all_quotation_mark_regex = regex.compile(r"") + opening_quotation_marks: Set[str] = set() closing_quotation_marks: Set[str] = set() - all_quotation_marks: Set[str] = set() - - if len(self._conventions) > 0: - for convention in self._conventions: - for level in range(1, convention.num_levels + 1): - opening_quote = convention.get_opening_quote_at_level(level) - closing_quote = convention.get_closing_quote_at_level(level) - opening_quotation_marks.add(opening_quote) - closing_quotation_marks.add(closing_quote) - all_quotation_marks.add(opening_quote) - all_quotation_marks.add(closing_quote) - - if len(all_quotation_marks) > 0: - self._opening_quotation_mark_regex: Pattern = regex.compile( - r"[" + "".join(sorted(list(opening_quotation_marks))) + "]" - ) - self._closing_quotation_mark_regex: Pattern = regex.compile( - r"[" + "".join(sorted(list(closing_quotation_marks))) + "]" - ) - self._all_quotation_mark_regex: Pattern = regex.compile( - r"[" + "".join(sorted(list(all_quotation_marks))) + "]" - ) - if len(opening_quotation_marks) == 0: - self._opening_quotation_mark_regex = regex.compile(r"") - if len(closing_quotation_marks) == 0: - self._closing_quotation_mark_regex = regex.compile(r"") - if len(all_quotation_marks) == 0: - self._all_quotation_mark_regex = regex.compile(r"") + for convention in self._conventions: + for 
level in range(1, convention.num_levels + 1): + opening_quotation_mark = convention.get_opening_quotation_mark_at_level(level) + closing_quotation_mark = convention.get_closing_quotation_mark_at_level(level) + opening_quotation_marks.add(opening_quotation_mark) + closing_quotation_marks.add(closing_quotation_mark) + + all_quotation_marks = opening_quotation_marks.union(closing_quotation_marks) + + if len(all_quotation_marks) > 0: + self._opening_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(sorted(list(opening_quotation_marks))) + "]" + ) + self._closing_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(sorted(list(closing_quotation_marks))) + "]" + ) + self._all_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(sorted(list(all_quotation_marks))) + "]" + ) def _create_quotation_mark_pair_map(self) -> None: - self.closing_marks_by_opening_mark: Dict[str, set[str]] = dict() - self.opening_marks_by_closing_mark: Dict[str, set[str]] = dict() + self.closing_marks_by_opening_mark: Dict[str, set[str]] = defaultdict(set) + self.opening_marks_by_closing_mark: Dict[str, set[str]] = defaultdict(set) for convention in self._conventions: for level in range(1, convention.num_levels + 1): - opening_quote = convention.get_opening_quote_at_level(level) - closing_quote = convention.get_closing_quote_at_level(level) - if opening_quote not in self.closing_marks_by_opening_mark: - self.closing_marks_by_opening_mark[opening_quote] = set() - self.closing_marks_by_opening_mark[opening_quote].add(closing_quote) - if closing_quote not in self.opening_marks_by_closing_mark: - self.opening_marks_by_closing_mark[closing_quote] = set() - self.opening_marks_by_closing_mark[closing_quote].add(opening_quote) + opening_quotation_mark = convention.get_opening_quotation_mark_at_level(level) + closing_quotation_mark = convention.get_closing_quotation_mark_at_level(level) + 
self.closing_marks_by_opening_mark[opening_quotation_mark].add(closing_quotation_mark) + self.opening_marks_by_closing_mark[closing_quotation_mark].add(opening_quotation_mark) @property def opening_quotation_mark_regex(self) -> Pattern: diff --git a/machine/corpora/punctuation_analysis/text_segment.py b/machine/corpora/punctuation_analysis/text_segment.py index 491489fe..96ff01d1 100644 --- a/machine/corpora/punctuation_analysis/text_segment.py +++ b/machine/corpora/punctuation_analysis/text_segment.py @@ -67,10 +67,12 @@ def replace_substring(self, start_index: int, end_index: int, replacement: str) self._usfm_token.text = self._text # These setters need to be implemented outside the builder to avoid circular dependencies - def set_previous_segment(self, previous_segment: "TextSegment") -> None: + @previous_segment.setter + def previous_segment(self, previous_segment: "TextSegment") -> None: self._previous_segment = previous_segment - def set_next_segment(self, next_segment: "TextSegment") -> None: + @next_segment.setter + def next_segment(self, next_segment: "TextSegment") -> None: self._next_segment = next_segment def set_index_in_verse(self, index_in_verse: int) -> None: diff --git a/machine/corpora/punctuation_analysis/usfm_structure_extractor.py b/machine/corpora/punctuation_analysis/usfm_structure_extractor.py index 8958c2aa..303e30f4 100644 --- a/machine/corpora/punctuation_analysis/usfm_structure_extractor.py +++ b/machine/corpora/punctuation_analysis/usfm_structure_extractor.py @@ -11,9 +11,6 @@ class UsfmStructureExtractor(UsfmParserHandler): def __init__(self): - self._reset() - - def _reset(self): self._text_segments: list[TextSegment] = [] self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder() @@ -74,11 +71,11 @@ def text(self, state: UsfmParserState, text: str) -> None: self._next_text_segment_builder.set_text(text) text_segment: TextSegment = self._next_text_segment_builder.build() # don't look past verse boundaries, to enable 
identical functionality in the - # online one-verse-at-a-time (QuotationDenormalizationScriptureUpdateBlockHandler) + # online one-verse-at-a-time (QuotationMarkDenormalizationScriptureUpdateBlockHandler) # and offline whole-book-at-once settings (QuoteConventionDetector) if len(self._text_segments) > 0 and not text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): - self._text_segments[-1].set_next_segment(text_segment) - text_segment.set_previous_segment(self._text_segments[-1]) + self._text_segments[-1].next_segment = text_segment + text_segment.previous_segment = self._text_segments[-1] self._text_segments.append(text_segment) self._next_text_segment_builder = TextSegment.Builder() diff --git a/machine/corpora/quotation_denormalization_first_pass.py b/machine/corpora/quotation_mark_denormalization_first_pass.py similarity index 85% rename from machine/corpora/quotation_denormalization_first_pass.py rename to machine/corpora/quotation_mark_denormalization_first_pass.py index e3e98db4..4460d876 100644 --- a/machine/corpora/quotation_denormalization_first_pass.py +++ b/machine/corpora/quotation_mark_denormalization_first_pass.py @@ -3,7 +3,7 @@ # This is a convenience class so that users don't have to know to normalize the source quote convention -class QuotationDenormalizationFirstPass(QuotationMarkUpdateFirstPass): +class QuotationMarkDenormalizationFirstPass(QuotationMarkUpdateFirstPass): def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): super().__init__(source_quote_convention.normalize(), target_quote_convention) diff --git a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py b/machine/corpora/quotation_mark_denormalization_usfm_update_block_handler.py similarity index 86% rename from machine/corpora/quotation_denormalization_usfm_update_block_handler.py rename to machine/corpora/quotation_mark_denormalization_usfm_update_block_handler.py index e92fa1d1..baf75718 100644 --- 
a/machine/corpora/quotation_denormalization_usfm_update_block_handler.py +++ b/machine/corpora/quotation_mark_denormalization_usfm_update_block_handler.py @@ -4,7 +4,7 @@ # This is a convenience class so that users don't have to know to normalize the source quote convention -class QuotationDenormalizationUsfmUpdateBlockHandler(QuoteConventionChangingUsfmUpdateBlockHandler): +class QuotationMarkDenormalizationUsfmUpdateBlockHandler(QuoteConventionChangingUsfmUpdateBlockHandler): def __init__( self, diff --git a/machine/corpora/quotation_mark_update_first_pass.py b/machine/corpora/quotation_mark_update_first_pass.py index 414ef1c3..4e3f5a43 100644 --- a/machine/corpora/quotation_mark_update_first_pass.py +++ b/machine/corpora/quotation_mark_update_first_pass.py @@ -35,12 +35,12 @@ def _check_whether_fallback_mode_will_work( ) -> bool: target_marks_by_source_marks: Dict[str, Set[str]] = {} for level in range(1, source_quote_convention.num_levels + 1): - opening_quotation_mark = source_quote_convention.get_opening_quote_at_level(level) + opening_quotation_mark = source_quote_convention.get_opening_quotation_mark_at_level(level) if opening_quotation_mark not in target_marks_by_source_marks: target_marks_by_source_marks[opening_quotation_mark] = set() if level <= target_quote_convention.num_levels: target_marks_by_source_marks[opening_quotation_mark].add( - target_quote_convention.get_closing_quote_at_level(level) + target_quote_convention.get_closing_quotation_mark_at_level(level) ) for source_mark in target_marks_by_source_marks: diff --git a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py index 86aa72ec..37e069a6 100644 --- a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py +++ b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py @@ -120,9 +120,9 @@ def _create_text_segment(self, token: UsfmToken) -> Union[TextSegment, None]: def 
_set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) -> List[TextSegment]: for i in range(len(text_segments)): if i > 0: - text_segments[i].set_previous_segment(text_segments[i - 1]) + text_segments[i].previous_segment = text_segments[i - 1] if i < len(text_segments) - 1: - text_segments[i].set_next_segment(text_segments[i + 1]) + text_segments[i].next_segment = text_segments[i + 1] return text_segments def _check_for_chapter_change(self, block: UsfmUpdateBlock) -> None: diff --git a/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py b/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py index 904cd6c8..5b832852 100644 --- a/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py +++ b/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py @@ -3,14 +3,14 @@ from machine.corpora import QuotationMarkUpdateResolutionSettings from machine.corpora.punctuation_analysis import ( DepthBasedQuotationMarkResolver, - QuotationContinuerState, - QuotationContinuerStyle, QuotationMarkCategorizer, QuotationMarkDirection, QuotationMarkMetadata, QuotationMarkResolutionIssue, QuotationMarkResolverState, QuotationMarkStringMatch, + QuoteContinuerState, + QuoteContinuerStyle, QuoteConventionDetectionResolutionSettings, QuoteConventionSet, TextSegment, @@ -20,29 +20,29 @@ # QuotationMarkResolverState tests -def test_get_current_depth_quotation_mark_resolver_state() -> None: +def test_current_depth_quotation_mark_resolver_state() -> None: quotation_mark_resolver_state = QuotationMarkResolverState() - assert quotation_mark_resolver_state.current_depth == 1 + assert quotation_mark_resolver_state.current_depth == 0 quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert quotation_mark_resolver_state.current_depth == 2 + assert quotation_mark_resolver_state.current_depth == 1 
quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert quotation_mark_resolver_state.current_depth == 3 + assert quotation_mark_resolver_state.current_depth == 2 quotation_mark_resolver_state.add_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert quotation_mark_resolver_state.current_depth == 2 + assert quotation_mark_resolver_state.current_depth == 1 quotation_mark_resolver_state.add_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert quotation_mark_resolver_state.current_depth == 1 + assert quotation_mark_resolver_state.current_depth == 0 def test_has_open_quotation_mark() -> None: @@ -172,27 +172,27 @@ def test_get_current_depth_quotation_continuer_state() -> None: QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() assert quotation_continuer_state.current_depth == 0 - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) assert quotation_continuer_state.current_depth == 1 - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) assert quotation_continuer_state.current_depth == 2 - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), 
quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) assert quotation_continuer_state.current_depth == 0 @@ -209,27 +209,27 @@ def test_has_continuer_been_observed() -> None: QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() assert not quotation_continuer_state.continuer_has_been_observed() - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) assert quotation_continuer_state.continuer_has_been_observed() - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) assert quotation_continuer_state.continuer_has_been_observed() - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) assert not quotation_continuer_state.continuer_has_been_observed() @@ -246,29 +246,29 @@ def test_get_continuer_style() -> None: QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - quotation_continuer_state = QuotationContinuerState() - assert quotation_continuer_state.continuer_style is QuotationContinuerStyle.UNDETERMINED + quotation_continuer_state = QuoteContinuerState() + assert quotation_continuer_state.continuer_style is QuoteContinuerStyle.UNDETERMINED - 
quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.continuer_style is QuotationContinuerStyle.ENGLISH + assert quotation_continuer_state.continuer_style is QuoteContinuerStyle.ENGLISH - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.SPANISH, + QuoteContinuerStyle.SPANISH, ) - assert quotation_continuer_state.continuer_style is QuotationContinuerStyle.SPANISH + assert quotation_continuer_state.continuer_style is QuoteContinuerStyle.SPANISH - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) - assert quotation_continuer_state.continuer_style is QuotationContinuerStyle.ENGLISH + assert quotation_continuer_state.continuer_style is QuoteContinuerStyle.ENGLISH def test_add_quotation_continuer() -> None: @@ -283,29 +283,29 @@ def test_add_quotation_continuer() -> None: QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() - assert quotation_continuer_state.add_quotation_continuer( + assert quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) == QuotationMarkMetadata( "\u201c", 1, 
QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u201c").build(), 0, 1 ) - assert quotation_continuer_state.add_quotation_continuer( + assert quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.SPANISH, + QuoteContinuerStyle.SPANISH, ) == QuotationMarkMetadata( "\u2018", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u2018").build(), 0, 1 ) - assert quotation_continuer_state.continuer_style == QuotationContinuerStyle.SPANISH + assert quotation_continuer_state.continuer_style == QuoteContinuerStyle.SPANISH - assert quotation_continuer_state.add_quotation_continuer( + assert quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) == QuotationMarkMetadata( "\u201c", 3, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u201c").build(), 0, 1 ) @@ -324,7 +324,7 @@ def test_is_english_quotation_continuer() -> None: QuoteConventionSet([standard_english_quote_convention]) ) quotation_mark_resolver_state = QuotationMarkResolverState() - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() quotation_mark_categorizer = QuotationMarkCategorizer( english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state @@ -335,8 +335,8 @@ def test_is_english_quotation_continuer() -> None: ) # Should always be false if the continuer style is Spanish - quotation_continuer_state._continuer_style = QuotationContinuerStyle.ENGLISH - assert quotation_mark_categorizer.is_english_quotation_continuer( + quotation_continuer_state._continuer_style = QuoteContinuerStyle.ENGLISH + assert quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( 
TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -346,8 +346,8 @@ def test_is_english_quotation_continuer() -> None: None, ) - quotation_continuer_state._continuer_style = QuotationContinuerStyle.SPANISH - assert not quotation_mark_categorizer.is_english_quotation_continuer( + quotation_continuer_state._continuer_style = QuoteContinuerStyle.SPANISH + assert not quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -356,10 +356,10 @@ def test_is_english_quotation_continuer() -> None: None, None, ) - quotation_continuer_state._continuer_style = QuotationContinuerStyle.ENGLISH + quotation_continuer_state._continuer_style = QuoteContinuerStyle.ENGLISH # Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) - assert not quotation_mark_categorizer.is_english_quotation_continuer( + assert not quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201ctest").build(), 0, @@ -369,7 +369,7 @@ def test_is_english_quotation_continuer() -> None: None, ) - assert quotation_mark_categorizer.is_english_quotation_continuer( + assert quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -384,7 +384,7 @@ def test_is_english_quotation_continuer() -> None: quotation_mark_resolver_state, quotation_continuer_state, ) - assert quotation_mark_categorizer_for_denormalization.is_english_quotation_continuer( + assert quotation_mark_categorizer_for_denormalization.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201ctest").build(), 0, @@ -399,7 +399,7 @@ def test_is_english_quotation_continuer() -> None: 
empty_quotation_mark_categorizer = QuotationMarkCategorizer( english_resolver_settings, empty_quotation_mark_resolver_state, quotation_continuer_state ) - assert not empty_quotation_mark_categorizer.is_english_quotation_continuer( + assert not empty_quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -410,7 +410,7 @@ def test_is_english_quotation_continuer() -> None: ) # Should be false if the starting index of the quotation mark is greater than 0 - assert not quotation_mark_categorizer.is_english_quotation_continuer( + assert not quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text(" \u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, @@ -421,7 +421,7 @@ def test_is_english_quotation_continuer() -> None: ) # Should be false if the mark does not match the already opened mark - assert not quotation_mark_categorizer.is_english_quotation_continuer( + assert not quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -436,7 +436,7 @@ def test_is_english_quotation_continuer() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert not quotation_mark_categorizer.is_english_quotation_continuer( + assert not quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -445,7 +445,7 @@ def test_is_english_quotation_continuer() -> None: None, None, ) - assert quotation_mark_categorizer.is_english_quotation_continuer( + assert quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( 
TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -458,7 +458,7 @@ def test_is_english_quotation_continuer() -> None: 2, ), ) - assert quotation_mark_categorizer.is_english_quotation_continuer( + assert quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201c\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -473,17 +473,17 @@ def test_is_english_quotation_continuer() -> None: ) # When there are multiple open quotes, the continuer must match the deepest observed mark - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) - assert not quotation_mark_categorizer.is_english_quotation_continuer( + assert not quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201c\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, @@ -493,7 +493,7 @@ def test_is_english_quotation_continuer() -> None: None, ) - assert quotation_mark_categorizer.is_english_quotation_continuer( + assert quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, @@ -511,7 +511,7 @@ def test_is_english_quotation_continuer() -> None: ) ) - assert quotation_mark_categorizer.is_english_quotation_continuer( + assert quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u201c\u2018\u201ctest") @@ -524,7 +524,7 @@ def test_is_english_quotation_continuer() -> None: None, ) - 
quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u201c\u2018\u201ctest") @@ -534,9 +534,9 @@ def test_is_english_quotation_continuer() -> None: 2, ), quotation_mark_resolver_state, - QuotationContinuerStyle.ENGLISH, + QuoteContinuerStyle.ENGLISH, ) - assert not quotation_mark_categorizer.is_english_quotation_continuer( + assert not quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u201c\u2018\u2018test") @@ -548,7 +548,7 @@ def test_is_english_quotation_continuer() -> None: None, None, ) - assert quotation_mark_categorizer.is_english_quotation_continuer( + assert quotation_mark_categorizer.is_english_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u201c\u2018\u201ctest") @@ -572,7 +572,7 @@ def test_is_spanish_quotation_continuer() -> None: QuoteConventionSet([western_european_quote_convention]) ) quotation_mark_resolver_state = QuotationMarkResolverState() - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() quotation_mark_categorizer = QuotationMarkCategorizer( spanish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state @@ -583,8 +583,8 @@ def test_is_spanish_quotation_continuer() -> None: ) # Should always be false if the continuer style is English - quotation_continuer_state._continuer_style = QuotationContinuerStyle.SPANISH - assert quotation_mark_categorizer.is_spanish_quotation_continuer( + quotation_continuer_state._continuer_style = QuoteContinuerStyle.SPANISH + assert quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -594,8 +594,8 @@ def test_is_spanish_quotation_continuer() -> None: None, ) - quotation_continuer_state._continuer_style = 
QuotationContinuerStyle.ENGLISH - assert not quotation_mark_categorizer.is_spanish_quotation_continuer( + quotation_continuer_state._continuer_style = QuoteContinuerStyle.ENGLISH + assert not quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -604,10 +604,10 @@ def test_is_spanish_quotation_continuer() -> None: None, None, ) - quotation_continuer_state._continuer_style = QuotationContinuerStyle.SPANISH + quotation_continuer_state._continuer_style = QuoteContinuerStyle.SPANISH # Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) - assert not quotation_mark_categorizer.is_spanish_quotation_continuer( + assert not quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bbtest").build(), 0, @@ -617,7 +617,7 @@ def test_is_spanish_quotation_continuer() -> None: None, ) - assert quotation_mark_categorizer.is_spanish_quotation_continuer( + assert quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -632,7 +632,7 @@ def test_is_spanish_quotation_continuer() -> None: quotation_mark_resolver_state, quotation_continuer_state, ) - assert quotation_mark_categorizer_for_denormalization.is_spanish_quotation_continuer( + assert quotation_mark_categorizer_for_denormalization.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bbtest").build(), 0, @@ -647,7 +647,7 @@ def test_is_spanish_quotation_continuer() -> None: empty_quotation_mark_categorizer = QuotationMarkCategorizer( spanish_resolver_settings, empty_quotation_mark_resolver_state, quotation_continuer_state ) - assert not empty_quotation_mark_categorizer.is_spanish_quotation_continuer( + assert not 
empty_quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -658,7 +658,7 @@ def test_is_spanish_quotation_continuer() -> None: ) # Should be false if the starting index of the quotation mark is greater than 0 - assert not quotation_mark_categorizer.is_spanish_quotation_continuer( + assert not quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text(" \u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, @@ -669,7 +669,7 @@ def test_is_spanish_quotation_continuer() -> None: ) # Should be false if the mark does not match the already opened mark - assert not quotation_mark_categorizer.is_spanish_quotation_continuer( + assert not quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -684,7 +684,7 @@ def test_is_spanish_quotation_continuer() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not quotation_mark_categorizer.is_spanish_quotation_continuer( + assert not quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -693,7 +693,7 @@ def test_is_spanish_quotation_continuer() -> None: None, None, ) - assert quotation_mark_categorizer.is_spanish_quotation_continuer( + assert quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -706,7 +706,7 @@ def test_is_spanish_quotation_continuer() -> None: 2, ), ) - assert 
quotation_mark_categorizer.is_spanish_quotation_continuer( + assert quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bb\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, @@ -721,17 +721,17 @@ def test_is_spanish_quotation_continuer() -> None: ) # When there are multiple open quotes, the continuer must match the deepest observed mark - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 0, 1, ), quotation_mark_resolver_state, - QuotationContinuerStyle.SPANISH, + QuoteContinuerStyle.SPANISH, ) - assert not quotation_mark_categorizer.is_spanish_quotation_continuer( + assert not quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bb\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, @@ -741,7 +741,7 @@ def test_is_spanish_quotation_continuer() -> None: None, ) - assert quotation_mark_categorizer.is_spanish_quotation_continuer( + assert quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 1, @@ -759,7 +759,7 @@ def test_is_spanish_quotation_continuer() -> None: ) ) - assert quotation_mark_categorizer.is_spanish_quotation_continuer( + assert quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u00bb\u201d\u2019test") @@ -772,7 +772,7 @@ def test_is_spanish_quotation_continuer() -> None: None, ) - quotation_continuer_state.add_quotation_continuer( + quotation_continuer_state.add_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u00bb\u201d\u2019test") @@ -782,9 +782,9 @@ def 
test_is_spanish_quotation_continuer() -> None: 2, ), quotation_mark_resolver_state, - QuotationContinuerStyle.SPANISH, + QuoteContinuerStyle.SPANISH, ) - assert not quotation_mark_categorizer.is_spanish_quotation_continuer( + assert not quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u00bb\u201d\u201dtest") @@ -796,7 +796,7 @@ def test_is_spanish_quotation_continuer() -> None: None, None, ) - assert quotation_mark_categorizer.is_spanish_quotation_continuer( + assert quotation_mark_categorizer.is_spanish_quote_continuer( QuotationMarkStringMatch( TextSegment.Builder() .set_text("\u00bb\u201d\u2019test") @@ -819,7 +819,7 @@ def test_is_opening_quote() -> None: QuoteConventionSet([central_european_quote_convention]) ) quotation_mark_resolver_state = QuotationMarkResolverState() - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() central_european_quotation_mark_categorizer = QuotationMarkCategorizer( central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) @@ -856,144 +856,144 @@ def test_is_opening_quote() -> None: ) # It should only accept valid opening marks under the quote convention - assert central_european_quotation_mark_categorizer.is_opening_quote( + assert central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e").build(), 1, 2) ) - assert central_european_quotation_mark_categorizer.is_opening_quote( + assert central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_opening_quote( + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c").build(), 1, 2) ) - assert not 
central_european_quotation_mark_categorizer.is_opening_quote( + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_opening_quote( + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_opening_quote( + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_opening_quote( + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_opening_quote( + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(' "').build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_opening_quote( + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_opening_quote( + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a").build(), 1, 2) ) - assert british_english_quotation_mark_categorizer.is_opening_quote( + assert british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c").build(), 1, 2) ) - assert british_english_quotation_mark_categorizer.is_opening_quote( + assert 
british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_opening_quote( + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_opening_quote( + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_opening_quote( + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_opening_quote( + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(' "').build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not 
standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018").build(), 1, 2) ) - assert standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) ) - assert standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(' "').build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( 
QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(' "').build(), 1, 2) ) # Leading whitespace is not necessary for unambiguous opening quotes - assert central_european_quotation_mark_categorizer.is_opening_quote( + assert central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201e").build(), 4, 5) ) - assert central_european_quotation_mark_categorizer.is_opening_quote( + assert central_european_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201a").build(), 4, 5) ) - assert british_english_quotation_mark_categorizer.is_opening_quote( + assert british_english_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201c").build(), 4, 5) ) - assert british_english_quotation_mark_categorizer.is_opening_quote( + assert british_english_quotation_mark_categorizer.is_opening_quotation_mark( 
QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u2018").build(), 4, 5) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201e").build(), 4, 5) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201a").build(), 4, 5) ) # An ambiguous quotation mark (opening/closing) is recognized as opening if # it has a quote introducer beforehand - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201d").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(":\u2019").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert 
three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201c").build(), 1, 2) ) @@ -1002,37 +1002,37 @@ def test_is_opening_quote() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 1, 2) ) - assert standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 1, 2) ) - assert standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201c").build(), 1, 2) ) # An ambiguous quotation mark (opening/closing) is not recognized as opening if # it has trailing whitespace or punctuation - assert not 
standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d.").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201d ").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019 ").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019?").build(), 1, 2) ) @@ -1046,7 +1046,7 @@ def test_is_closing_quote() -> None: QuoteConventionSet([central_european_quote_convention]) ) quotation_mark_resolver_state = QuotationMarkResolverState() - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() central_european_quotation_mark_categorizer = QuotationMarkCategorizer( central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) @@ -1094,129 +1094,129 @@ def test_is_closing_quote() -> None: ) # It should only accept valid closing marks under the quote convention - assert central_european_quotation_mark_categorizer.is_closing_quote( + assert central_european_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) ) - assert central_european_quotation_mark_categorizer.is_closing_quote( + assert central_european_quotation_mark_categorizer.is_closing_quotation_mark( 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_closing_quote( + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e ").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_closing_quote( + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a ").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_closing_quote( + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_closing_quote( + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_closing_quote( + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_closing_quote( + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_closing_quote( + assert not british_english_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_closing_quote( + assert not british_english_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 
").build(), 0, 1) ) - assert british_english_quotation_mark_categorizer.is_closing_quote( + assert british_english_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert british_english_quotation_mark_categorizer.is_closing_quote( + assert british_english_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_closing_quote( + assert not british_english_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_closing_quote( + assert not british_english_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + 
assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) ) # Trailing whitespace is not necessary for unambiguous closing quotes - assert standard_french_quotation_mark_categorizer.is_closing_quote( + assert 
standard_french_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bbtext").build(), 0, 1) ) - assert standard_french_quotation_mark_categorizer.is_closing_quote( + assert standard_french_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u203atext").build(), 0, 1) ) # An ambiguous quotation mark (opening/closing) is recognized as closing if # followed by whitespace, punctuation or the end of the segment - assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201dtext").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019text").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019?").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019\u201d").build(), 0, 1) ) - assert not 
three_conventions_quotation_mark_categorizer.is_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201ctext").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c?").build(), 0, 1) ) # An ambiguous quotation mark (opening/closing) is not recognized as opening if # it has leading whitespace - assert not standard_swedish_quotation_mark_categorizer.is_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\t\u201c?").build(), 1, 2) ) @@ -1230,7 +1230,7 @@ def test_is_malformed_opening_quote() -> None: QuoteConventionSet([central_european_quote_convention]) ) quotation_mark_resolver_state = QuotationMarkResolverState() - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() central_european_quotation_mark_categorizer = QuotationMarkCategorizer( central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) @@ -1267,134 +1267,134 @@ def test_is_malformed_opening_quote() -> None: ) # It should only accept valid opening marks under the quote convention - assert central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) ) - assert 
central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert not 
british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) ) - assert british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) ) - assert british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) ) - assert not 
standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) ) - assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) ) - assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( 
QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) ) - assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert not 
three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) ) # Should return true if there is a leading quote introducer - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201d ").build(), 1, 2) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(":\u2019 ").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201c ").build(), 1, 2) ) # Should return false unless the mark has leading and trailing whitespace - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not 
standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) ) - assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) ) @@ -1402,22 +1402,22 @@ def test_is_malformed_opening_quote() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_malformed_opening_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) ) - assert not central_european_quotation_mark_categorizer.is_malformed_opening_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) ) - assert not 
three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) ) @@ -1431,7 +1431,7 @@ def test_is_malformed_closing_quote() -> None: QuoteConventionSet([central_european_quote_convention]) ) quotation_mark_resolver_state = QuotationMarkResolverState() - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() central_european_quotation_mark_categorizer = QuotationMarkCategorizer( central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) @@ -1471,28 +1471,28 @@ def test_is_malformed_closing_quote() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) ) - assert central_european_quotation_mark_categorizer.is_malformed_closing_quote( + assert central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) ) - assert not 
central_european_quotation_mark_categorizer.is_malformed_closing_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_malformed_closing_quote( + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) ) @@ -1502,22 +1502,22 @@ def test_is_malformed_closing_quote() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) ) @@ -1527,62 +1527,62 @@ def test_is_malformed_closing_quote() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert 
standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - 
assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) ) # Returns true if it's at the end of the segment - assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) # Returns true if it does not have trailing whitespace - assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d-").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201dtext").build(), 0, 1) ) # Returns true if it has trailing and leading whitespace - assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) ) @@ -1590,7 +1590,7 @@ def 
test_is_malformed_closing_quote() -> None: quotation_mark_resolver_state.add_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) @@ -1599,23 +1599,23 @@ def test_is_malformed_closing_quote() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert british_english_quotation_mark_categorizer.is_malformed_closing_quote( + assert 
british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) @@ -1629,7 +1629,7 @@ def test_is_unpaired_closing_quote() -> None: QuoteConventionSet([central_european_quote_convention]) ) quotation_mark_resolver_state = QuotationMarkResolverState() - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() central_european_quotation_mark_categorizer = QuotationMarkCategorizer( central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) @@ -1666,85 +1666,85 @@ def test_is_unpaired_closing_quote() -> None: ) # It should only accept valid closing marks under the quote convention - assert central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not 
british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + assert standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) ) - assert 
three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) ) @@ -1752,34 +1752,34 @@ def test_is_unpaired_closing_quote() -> None: quotation_mark_resolver_state.add_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - 
assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) @@ -1787,22 +1787,22 @@ def test_is_unpaired_closing_quote() -> None: quotation_mark_resolver_state.add_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\t\u2019").build(), 1, 2) ) # The quotation mark must be either at the end of the segment # or have trailing whitespace - assert british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) - assert british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) ) - assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quote( + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d?").build(), 0, 1) ) @@ -1816,7 +1816,7 @@ def test_is_apostrophe() -> None: 
QuoteConventionSet([standard_english_quote_convention]) ) quotation_mark_resolver_state = QuotationMarkResolverState() - quotation_continuer_state = QuotationContinuerState() + quotation_continuer_state = QuoteContinuerState() standard_english_quotation_mark_categorizer = QuotationMarkCategorizer( standard_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) diff --git a/tests/corpora/punctuation_analysis/test_preliminary_quotation_analyzer.py b/tests/corpora/punctuation_analysis/test_preliminary_quotation_mark_analyzer.py similarity index 93% rename from tests/corpora/punctuation_analysis/test_preliminary_quotation_analyzer.py rename to tests/corpora/punctuation_analysis/test_preliminary_quotation_mark_analyzer.py index a52d2164..4607fafa 100644 --- a/tests/corpora/punctuation_analysis/test_preliminary_quotation_analyzer.py +++ b/tests/corpora/punctuation_analysis/test_preliminary_quotation_mark_analyzer.py @@ -2,7 +2,7 @@ ApostropheProportionStatistics, Chapter, PreliminaryApostropheAnalyzer, - PreliminaryQuotationAnalyzer, + PreliminaryQuotationMarkAnalyzer, QuotationMarkGrouper, QuotationMarkSequences, QuotationMarkStringMatch, @@ -181,25 +181,25 @@ def test_is_mark_much_more_common_earlier() -> None: quotation_mark_sequences = QuotationMarkSequences() assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') 
+ quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') assert quotation_mark_sequences.is_mark_much_more_common_earlier('"') - quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') assert quotation_mark_sequences.is_mark_much_more_common_earlier('"') - quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') @@ -207,53 +207,54 @@ def test_is_mark_much_more_common_later() -> None: quotation_mark_sequences = QuotationMarkSequences() assert not quotation_mark_sequences.is_mark_much_more_common_later('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + 
quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') assert quotation_mark_sequences.is_mark_much_more_common_later('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') assert not quotation_mark_sequences.is_mark_much_more_common_later('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') assert quotation_mark_sequences.is_mark_much_more_common_later('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') assert not quotation_mark_sequences.is_mark_much_more_common_later('"') def test_is_mark_common_early_and_late() -> None: quotation_mark_sequences = QuotationMarkSequences() - assert not quotation_mark_sequences.is_mark_common_early_and_late('"') + assert not quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - assert quotation_mark_sequences.is_mark_common_early_and_late('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + assert quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') - 
quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - quotation_mark_sequences.record_earlier_quotation_mark('"') - quotation_mark_sequences.record_later_quotation_mark('"') - assert quotation_mark_sequences.is_mark_common_early_and_late('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + assert quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') - quotation_mark_sequences.record_later_quotation_mark('"') - assert quotation_mark_sequences.is_mark_common_early_and_late('"') + quotation_mark_sequences.count_later_quotation_mark('"') + assert quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') - quotation_mark_sequences.record_later_quotation_mark('"') - assert not quotation_mark_sequences.is_mark_common_early_and_late('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + assert not quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') # QuotationMarkGrouper tests @@ -740,7 +741,7 @@ 
def test_that_a_frequently_occurring_character_is_an_apostrophe() -> None: assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") -# PreliminaryQuotationAnalyzer tests +# PreliminaryQuotationMarkAnalyzer tests def test_that_quotation_mark_sequence_is_used_to_determine_opening_and_closing_quotes() -> None: standard_english_quote_convention = QuoteConvention( "standard_english", @@ -788,7 +789,7 @@ def test_that_quotation_mark_sequence_is_used_to_determine_opening_and_closing_q ], ) - preliminary_quotation_analyzer = PreliminaryQuotationAnalyzer( + preliminary_quotation_analyzer = PreliminaryQuotationMarkAnalyzer( QuoteConventionSet( [ standard_english_quote_convention, @@ -961,7 +962,7 @@ def test_that_apostrophes_not_considered_as_quotation_marks() -> None: ], ) - preliminary_quotation_analyzer = PreliminaryQuotationAnalyzer( + preliminary_quotation_analyzer = PreliminaryQuotationMarkAnalyzer( QuoteConventionSet( [ standard_english_quote_convention, diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py b/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py index 07988774..ce17063c 100644 --- a/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py @@ -17,16 +17,16 @@ def test_reset() -> None: ) assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] - assert quotation_mark_resolver._quotation_continuer_state._quotation_continuer_stack == [] - assert quotation_mark_resolver._quotation_mark_resolver_state._current_depth == 0 - assert quotation_mark_resolver._quotation_continuer_state._current_depth == 0 + assert quotation_mark_resolver._quote_continuer_state._quote_continuer_mark_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + assert quotation_mark_resolver._quote_continuer_state.current_depth == 0 quotation_mark_resolver.reset() assert 
quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] - assert quotation_mark_resolver._quotation_continuer_state._quotation_continuer_stack == [] - assert quotation_mark_resolver._quotation_mark_resolver_state._current_depth == 0 - assert quotation_mark_resolver._quotation_continuer_state._current_depth == 0 + assert quotation_mark_resolver._quote_continuer_state._quote_continuer_mark_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + assert quotation_mark_resolver._quote_continuer_state.current_depth == 0 quotation_mark_string_matches: List[QuotationMarkStringMatch] = [ QuotationMarkStringMatch(TextSegment.Builder().set_text("Opening “quote").build(), 8, 9), @@ -40,11 +40,11 @@ def test_reset() -> None: list(quotation_mark_resolver.resolve_quotation_marks(quotation_mark_string_matches)) assert len(quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack) > 0 - assert quotation_mark_resolver._quotation_mark_resolver_state._current_depth > 0 + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth > 0 quotation_mark_resolver.reset() assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] - assert quotation_mark_resolver._quotation_continuer_state._quotation_continuer_stack == [] - assert quotation_mark_resolver._quotation_mark_resolver_state._current_depth == 0 - assert quotation_mark_resolver._quotation_continuer_state._current_depth == 0 + assert quotation_mark_resolver._quote_continuer_state._quote_continuer_mark_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + assert quotation_mark_resolver._quote_continuer_state.current_depth == 0 diff --git a/tests/corpora/punctuation_analysis/test_quote_convention.py b/tests/corpora/punctuation_analysis/test_quote_convention.py index d20e5337..89e413a2 100644 --- a/tests/corpora/punctuation_analysis/test_quote_convention.py +++ 
b/tests/corpora/punctuation_analysis/test_quote_convention.py @@ -5,83 +5,83 @@ def test_single_level_quote_convention_normalize() -> None: english_level1_quote_convention = SingleLevelQuoteConvention("\u201c", "\u201d") normalized_english_level1_quote_convention = english_level1_quote_convention.normalize() - assert normalized_english_level1_quote_convention.opening_quote == '"' - assert normalized_english_level1_quote_convention.closing_quote == '"' + assert normalized_english_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_english_level1_quote_convention.closing_quotation_mark == '"' english_level2_quote_convention = SingleLevelQuoteConvention("\u2018", "\u2019") normalized_english_level2_quote_convention = english_level2_quote_convention.normalize() - assert normalized_english_level2_quote_convention.opening_quote == "'" - assert normalized_english_level2_quote_convention.closing_quote == "'" + assert normalized_english_level2_quote_convention.opening_quotation_mark == "'" + assert normalized_english_level2_quote_convention.closing_quotation_mark == "'" already_normalized_english_level1_quote_convention = SingleLevelQuoteConvention('"', '"') doubly_normalized_english_level1_quote_convention = already_normalized_english_level1_quote_convention.normalize() - assert doubly_normalized_english_level1_quote_convention.opening_quote == '"' - assert doubly_normalized_english_level1_quote_convention.closing_quote == '"' + assert doubly_normalized_english_level1_quote_convention.opening_quotation_mark == '"' + assert doubly_normalized_english_level1_quote_convention.closing_quotation_mark == '"' already_normalized_english_level2_quote_convention = SingleLevelQuoteConvention("'", "'") doubly_normalized_english_level2_quote_convention = already_normalized_english_level2_quote_convention.normalize() - assert doubly_normalized_english_level2_quote_convention.opening_quote == "'" - assert 
doubly_normalized_english_level2_quote_convention.closing_quote == "'" + assert doubly_normalized_english_level2_quote_convention.opening_quotation_mark == "'" + assert doubly_normalized_english_level2_quote_convention.closing_quotation_mark == "'" french_level1_quote_convention = SingleLevelQuoteConvention("\u00ab", "\u00bb") normalized_french_level1_quote_convention = french_level1_quote_convention.normalize() - assert normalized_french_level1_quote_convention.opening_quote == '"' - assert normalized_french_level1_quote_convention.closing_quote == '"' + assert normalized_french_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_french_level1_quote_convention.closing_quotation_mark == '"' french_level2_quote_convention = SingleLevelQuoteConvention("\u2039", "\u203a") normalized_french_level2_quote_convention = french_level2_quote_convention.normalize() - assert normalized_french_level2_quote_convention.opening_quote == "\u2039" - assert normalized_french_level2_quote_convention.closing_quote == "\u203a" + assert normalized_french_level2_quote_convention.opening_quotation_mark == "\u2039" + assert normalized_french_level2_quote_convention.closing_quotation_mark == "\u203a" typewriter_french_level1_quote_convention = SingleLevelQuoteConvention("<<", ">>") normalized_typewriter_french_level1_quote_convention = typewriter_french_level1_quote_convention.normalize() - assert normalized_typewriter_french_level1_quote_convention.opening_quote == "<<" - assert normalized_typewriter_french_level1_quote_convention.closing_quote == ">>" + assert normalized_typewriter_french_level1_quote_convention.opening_quotation_mark == "<<" + assert normalized_typewriter_french_level1_quote_convention.closing_quotation_mark == ">>" typewriter_french_level2_quote_convention = SingleLevelQuoteConvention("<", ">") normalized_typewriter_french_level2_quote_convention = typewriter_french_level2_quote_convention.normalize() - assert 
normalized_typewriter_french_level2_quote_convention.opening_quote == "<" - assert normalized_typewriter_french_level2_quote_convention.closing_quote == ">" + assert normalized_typewriter_french_level2_quote_convention.opening_quotation_mark == "<" + assert normalized_typewriter_french_level2_quote_convention.closing_quotation_mark == ">" central_european_level1_quote_convention = SingleLevelQuoteConvention("\u201e", "\u201c") normalized_central_european_level1_quote_convention = central_european_level1_quote_convention.normalize() - assert normalized_central_european_level1_quote_convention.opening_quote == '"' - assert normalized_central_european_level1_quote_convention.closing_quote == '"' + assert normalized_central_european_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_central_european_level1_quote_convention.closing_quotation_mark == '"' central_european_level2_quote_convention = SingleLevelQuoteConvention("\u201a", "\u2018") normalized_central_european_level2_quote_convention = central_european_level2_quote_convention.normalize() - assert normalized_central_european_level2_quote_convention.opening_quote == "'" - assert normalized_central_european_level2_quote_convention.closing_quote == "'" + assert normalized_central_european_level2_quote_convention.opening_quotation_mark == "'" + assert normalized_central_european_level2_quote_convention.closing_quotation_mark == "'" central_european_guillemets_quote_convention = SingleLevelQuoteConvention("\u00bb", "\u00ab") normalized_central_european_guillemets_quote_convention = central_european_guillemets_quote_convention.normalize() - assert normalized_central_european_guillemets_quote_convention.opening_quote == '"' - assert normalized_central_european_guillemets_quote_convention.closing_quote == '"' + assert normalized_central_european_guillemets_quote_convention.opening_quotation_mark == '"' + assert normalized_central_european_guillemets_quote_convention.closing_quotation_mark == '"' 
swedish_level1_quote_convention = SingleLevelQuoteConvention("\u201d", "\u201d") normalized_swedish_level1_quote_convention = swedish_level1_quote_convention.normalize() - assert normalized_swedish_level1_quote_convention.opening_quote == '"' - assert normalized_swedish_level1_quote_convention.closing_quote == '"' + assert normalized_swedish_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_swedish_level1_quote_convention.closing_quotation_mark == '"' swedish_level2_quote_convention = SingleLevelQuoteConvention("\u2019", "\u2019") normalized_swedish_level2_quote_convention = swedish_level2_quote_convention.normalize() - assert normalized_swedish_level2_quote_convention.opening_quote == "'" - assert normalized_swedish_level2_quote_convention.closing_quote == "'" + assert normalized_swedish_level2_quote_convention.opening_quotation_mark == "'" + assert normalized_swedish_level2_quote_convention.closing_quotation_mark == "'" finnish_level1_quote_convention = SingleLevelQuoteConvention("\u00bb", "\u00bb") normalized_finnish_level1_quote_convention = finnish_level1_quote_convention.normalize() - assert normalized_finnish_level1_quote_convention.opening_quote == '"' - assert normalized_finnish_level1_quote_convention.closing_quote == '"' + assert normalized_finnish_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_finnish_level1_quote_convention.closing_quotation_mark == '"' arabic_level1_quote_convention = SingleLevelQuoteConvention("\u201d", "\u201c") normalized_arabic_level1_quote_convention = arabic_level1_quote_convention.normalize() - assert normalized_arabic_level1_quote_convention.opening_quote == '"' - assert normalized_arabic_level1_quote_convention.closing_quote == '"' + assert normalized_arabic_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_arabic_level1_quote_convention.closing_quotation_mark == '"' arabic_level2_quote_convention = SingleLevelQuoteConvention("\u2019", "\u2018") 
normalized_arabic_level2_quote_convention = arabic_level2_quote_convention.normalize() - assert normalized_arabic_level2_quote_convention.opening_quote == "'" - assert normalized_arabic_level2_quote_convention.closing_quote == "'" + assert normalized_arabic_level2_quote_convention.opening_quotation_mark == "'" + assert normalized_arabic_level2_quote_convention.closing_quotation_mark == "'" def test_get_num_levels() -> None: @@ -123,9 +123,9 @@ def test_get_opening_quote_at_level() -> None: SingleLevelQuoteConvention("\u00ab", "\u00bb"), ], ) - assert quote_convention.get_opening_quote_at_level(1) == "\u201c" - assert quote_convention.get_opening_quote_at_level(2) == "\u2018" - assert quote_convention.get_opening_quote_at_level(3) == "\u00ab" + assert quote_convention.get_opening_quotation_mark_at_level(1) == "\u201c" + assert quote_convention.get_opening_quotation_mark_at_level(2) == "\u2018" + assert quote_convention.get_opening_quotation_mark_at_level(3) == "\u00ab" def test_get_closing_quote_at_level() -> None: @@ -137,9 +137,9 @@ def test_get_closing_quote_at_level() -> None: SingleLevelQuoteConvention("\u00ab", "\u00bb"), ], ) - assert quote_convention.get_closing_quote_at_level(1) == "\u201d" - assert quote_convention.get_closing_quote_at_level(2) == "\u2019" - assert quote_convention.get_closing_quote_at_level(3) == "\u00bb" + assert quote_convention.get_closing_quotation_mark_at_level(1) == "\u201d" + assert quote_convention.get_closing_quotation_mark_at_level(2) == "\u2019" + assert quote_convention.get_closing_quotation_mark_at_level(3) == "\u00bb" def test_get_expected_quotation_mark() -> None: @@ -313,14 +313,14 @@ def test_normalize() -> None: normalized_standard_english_quote_convention = standard_english_quote_convention.normalize() assert normalized_standard_english_quote_convention.name == "standard-english-quote-convention_normalized" assert normalized_standard_english_quote_convention.num_levels == 4 - assert 
normalized_standard_english_quote_convention.get_opening_quote_at_level(1) == '"' - assert normalized_standard_english_quote_convention.get_closing_quote_at_level(1) == '"' - assert normalized_standard_english_quote_convention.get_opening_quote_at_level(2) == "'" - assert normalized_standard_english_quote_convention.get_closing_quote_at_level(2) == "'" - assert normalized_standard_english_quote_convention.get_opening_quote_at_level(3) == '"' - assert normalized_standard_english_quote_convention.get_closing_quote_at_level(3) == '"' - assert normalized_standard_english_quote_convention.get_opening_quote_at_level(4) == "'" - assert normalized_standard_english_quote_convention.get_closing_quote_at_level(4) == "'" + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_level(1) == '"' + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_level(1) == '"' + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_level(2) == "'" + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_level(2) == "'" + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_level(3) == '"' + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_level(3) == '"' + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_level(4) == "'" + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_level(4) == "'" western_european_quote_convention = QuoteConvention( "test-quote-convention", @@ -333,12 +333,12 @@ def test_normalize() -> None: normalized_western_european_quote_convention = western_european_quote_convention.normalize() assert normalized_western_european_quote_convention.name == "test-quote-convention_normalized" assert normalized_western_european_quote_convention.num_levels == 3 - assert normalized_western_european_quote_convention.get_opening_quote_at_level(1) == '"' - 
assert normalized_western_european_quote_convention.get_closing_quote_at_level(1) == '"' - assert normalized_western_european_quote_convention.get_opening_quote_at_level(2) == '"' - assert normalized_western_european_quote_convention.get_closing_quote_at_level(2) == '"' - assert normalized_western_european_quote_convention.get_opening_quote_at_level(3) == "'" - assert normalized_western_european_quote_convention.get_closing_quote_at_level(3) == "'" + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_level(1) == '"' + assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_level(1) == '"' + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_level(2) == '"' + assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_level(2) == '"' + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_level(3) == "'" + assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_level(3) == "'" hybrid_british_typewriter_english_quote_convention = QuoteConvention( "hybrid-british-typewriter-english-quote-convention", @@ -357,12 +357,12 @@ def test_normalize() -> None: == "hybrid-british-typewriter-english-quote-convention_normalized" ) assert normalized_hybrid_british_typewriter_english_quote_convention.num_levels == 3 - assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quote_at_level(1) == '"' - assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quote_at_level(1) == '"' - assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quote_at_level(2) == "'" - assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quote_at_level(2) == "'" - assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quote_at_level(3) == '"' - assert 
normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quote_at_level(3) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_level(1) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_level(1) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_level(2) == "'" + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_level(2) == "'" + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_level(3) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_level(3) == '"' def test_print_summary() -> None: diff --git a/tests/corpora/punctuation_analysis/test_quote_convention_detector.py b/tests/corpora/punctuation_analysis/test_quote_convention_detector.py index 31df6034..2e5b015a 100644 --- a/tests/corpora/punctuation_analysis/test_quote_convention_detector.py +++ b/tests/corpora/punctuation_analysis/test_quote_convention_detector.py @@ -302,4 +302,4 @@ def test_mismatched_quotation_marks() -> None: def detect_quote_convention(usfm: str) -> Union[QuoteConventionAnalysis, None]: quote_convention_detector = QuoteConventionDetector() parse_usfm(usfm, quote_convention_detector) - return quote_convention_detector.detect_quotation_convention(print_summary=False) + return quote_convention_detector.detect_quote_convention(print_summary=False) diff --git a/tests/corpora/punctuation_analysis/test_text_segment.py b/tests/corpora/punctuation_analysis/test_text_segment.py index 25d64fef..4fa34058 100644 --- a/tests/corpora/punctuation_analysis/test_text_segment.py +++ b/tests/corpora/punctuation_analysis/test_text_segment.py @@ -70,7 +70,7 @@ def test_builder_set_usfm_token() -> None: def test_set_previous_segment() -> None: text_segment = 
TextSegment.Builder().set_text("example text").build() previous_segment = TextSegment.Builder().set_text("previous segment text").build() - text_segment.set_previous_segment(previous_segment) + text_segment.previous_segment = previous_segment assert text_segment._previous_segment == previous_segment assert text_segment._next_segment is None @@ -83,7 +83,7 @@ def test_set_previous_segment() -> None: def test_set_next_segment() -> None: text_segment = TextSegment.Builder().set_text("example text").build() next_segment = TextSegment.Builder().set_text("next segment text").build() - text_segment.set_next_segment(next_segment) + text_segment.next_segment = next_segment assert text_segment._previous_segment is None assert text_segment._next_segment == next_segment @@ -187,10 +187,10 @@ def test_equals() -> None: assert segment_with_preceding_marker != basic_segment segment_with_previous_segment = TextSegment.Builder().set_text("text1").build() - segment_with_previous_segment.set_previous_segment(segment_with_num_verses) + segment_with_previous_segment.previous_segment = segment_with_num_verses segment_with_next_segment = TextSegment.Builder().set_text("text1").build() - segment_with_next_segment.set_next_segment(segment_with_num_verses) + segment_with_next_segment.next_segment = segment_with_num_verses assert basic_segment == segment_with_previous_segment assert basic_segment == segment_with_next_segment diff --git a/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py b/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py index e489a620..26cad441 100644 --- a/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py +++ b/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py @@ -416,19 +416,6 @@ def test_empty_text(): ) -def test_reset(): - usfm_structure_extractor = UsfmStructureExtractor() - usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) - 
usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) - usfm_structure_extractor.text(verse_text_parser_state, "test") - usfm_structure_extractor._reset() - - expected_chapters = [] - - actual_chapters = usfm_structure_extractor.get_chapters() - assert_chapter_equal(expected_chapters, actual_chapters) - - def assert_chapter_equal(expected_chapters: List[Chapter], actual_chapters: List[Chapter]): assert len(expected_chapters) == len(actual_chapters) for expected_chapter, actual_chapter in zip(expected_chapters, actual_chapters): diff --git a/tests/corpora/test_fallback_quotation_mark_resolver.py b/tests/corpora/test_fallback_quotation_mark_resolver.py index 2ea25708..e9ad7bdc 100644 --- a/tests/corpora/test_fallback_quotation_mark_resolver.py +++ b/tests/corpora/test_fallback_quotation_mark_resolver.py @@ -76,35 +76,35 @@ def test_is_opening_quote(): # valid opening quote at start of segment quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) - assert basic_quotation_mark_resolver._is_opening_quote(quote_match) is True + assert basic_quotation_mark_resolver._is_opening_quotation_mark(quote_match) is True # opening quote with leading whitespace quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 5, 6) - assert basic_quotation_mark_resolver._is_opening_quote(quote_match) is True + assert basic_quotation_mark_resolver._is_opening_quotation_mark(quote_match) is True # opening quote with quote introducer quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test:"text"').build(), 5, 6) - assert basic_quotation_mark_resolver._is_opening_quote(quote_match) is True + assert basic_quotation_mark_resolver._is_opening_quotation_mark(quote_match) is True # QuotationMarkStringMatch indices don't indicate a quotation mark quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 0, 1) - assert 
basic_quotation_mark_resolver._is_opening_quote(quote_match) is False + assert basic_quotation_mark_resolver._is_opening_quotation_mark(quote_match) is False # the quotation mark is not valid under the current quote convention quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('').build(), 10, 11) - assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False # no trailing whitespace after quotation mark quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test"text').build(), 5, 6) - assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False # opening quote at the start of the segment quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) - assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False # opening quote with leading whitespace quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 5, 6) - assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False def test_is_closing_quote_with_unambiguous_quote_convention(): @@ -250,19 +250,19 @@ def test_is_closing_quote_with_unambiguous_quote_convention(): # unambiguous closing quote at end of segment quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("“test text”").build(), 10, 11) - assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is True + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is True # unambiguous closing quote with trailing whitespace quote_match = 
QuotationMarkStringMatch(TextSegment.Builder().set_text("“test” text").build(), 5, 6) - assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is True + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is True # unambiguous closing quote without the "correct" context quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("“test”text").build(), 5, 6) - assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is True + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is True # unambiguous opening quote quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("test “text”").build(), 5, 6) - assert basic_quotation_mark_resolver._is_closing_quote(quote_match) is False + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False def test_resolve_opening_quote(): diff --git a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py index 4e18b2a8..36f73133 100644 --- a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py +++ b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py @@ -1,7 +1,7 @@ from typing import Union from machine.corpora import ( - QuotationDenormalizationUsfmUpdateBlockHandler, + QuotationMarkDenormalizationUsfmUpdateBlockHandler, QuotationMarkUpdateSettings, QuotationMarkUpdateStrategy, UpdateUsfmParserHandler, @@ -379,7 +379,7 @@ def denormalize_quotation_marks( target_quote_convention_name: str, quotation_denormalization_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), ) -> str: - quotation_denormalizer: QuotationDenormalizationUsfmUpdateBlockHandler = ( + quotation_denormalizer: QuotationMarkDenormalizationUsfmUpdateBlockHandler = ( create_quotation_denormalization_usfm_update_block_handler( source_quote_convention_name, target_quote_convention_name, 
quotation_denormalization_settings ) @@ -395,11 +395,11 @@ def create_quotation_denormalization_usfm_update_block_handler( source_quote_convention_name: str, target_quote_convention_name: str, quotation_denormalization_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), -) -> QuotationDenormalizationUsfmUpdateBlockHandler: +) -> QuotationMarkDenormalizationUsfmUpdateBlockHandler: source_quote_convention = get_quote_convention_by_name(source_quote_convention_name) target_quote_convention = get_quote_convention_by_name(target_quote_convention_name) - return QuotationDenormalizationUsfmUpdateBlockHandler( + return QuotationMarkDenormalizationUsfmUpdateBlockHandler( source_quote_convention, target_quote_convention, quotation_denormalization_settings, diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index 68e0923c..8b60fb80 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -1,7 +1,6 @@ from typing import Generator, List, Set, Union from machine.corpora import ( - QuotationMarkUpdateResolutionSettings, QuotationMarkUpdateSettings, QuotationMarkUpdateStrategy, QuoteConventionChangingUsfmUpdateBlockHandler, @@ -21,7 +20,6 @@ QuotationMarkResolutionIssue, QuotationMarkResolver, QuotationMarkStringMatch, - QuoteConvention, QuoteConventionSet, TextSegment, UsfmMarkerType, @@ -681,7 +679,9 @@ def find_all_potential_quotation_marks_in_text_segments( class MockQuotationMarkResolver(QuotationMarkResolver): def __init__(self): - super().__init__(QuotationMarkUpdateResolutionSettings(QuoteConvention("", []), QuoteConvention("", []))) + self.num_times_called = 0 + + def reset(self) -> None: self.num_times_called = 0 def resolve_quotation_marks( From 7375fe9dc69bf4c7bb4da7e9ab9bab6401255481 Mon Sep 17 00:00:00 2001 From: Ben King Date: 
Thu, 10 Jul 2025 11:59:02 -0400 Subject: [PATCH 23/31] One code review change that was left out of the previous commit --- .../depth_based_quotation_mark_resolver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py index a9b158ce..55c2d398 100644 --- a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py +++ b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py @@ -116,7 +116,7 @@ def is_english_quote_continuer( ) -> bool: if self._quote_continuer_state.continuer_style == QuoteContinuerStyle.SPANISH: return False - if not self._meets_quote_continuer_prerequisites(quotation_mark_match, previous_match, next_match): + if not self._meets_quote_continuer_prerequisites(quotation_mark_match): return False if ( @@ -146,7 +146,7 @@ def is_spanish_quote_continuer( ) -> bool: if self._quote_continuer_state.continuer_style == QuoteContinuerStyle.ENGLISH: return False - if not self._meets_quote_continuer_prerequisites(quotation_mark_match, previous_match, next_match): + if not self._meets_quote_continuer_prerequisites(quotation_mark_match): return False if not self._settings.are_marks_a_valid_pair( @@ -175,8 +175,6 @@ def is_spanish_quote_continuer( def _meets_quote_continuer_prerequisites( self, quotation_mark_match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, None], - next_match: Union[QuotationMarkStringMatch, None], ) -> bool: if ( self._settings.should_rely_on_paragraph_markers From 5195ee3193c9706cb68e06b49a2775bada2e4921 Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 17 Jul 2025 10:43:39 -0400 Subject: [PATCH 24/31] Fixes for Eli's hopefully final code review comments --- .../fallback_quotation_mark_resolver.py | 16 +- .../corpora/punctuation_analysis/__init__.py | 2 + .../depth_based_quotation_mark_resolver.py | 30 
++-- .../preliminary_quotation_mark_analyzer.py | 6 +- .../quotation_mark_string_match.py | 16 +- .../quotation_mark_tabulator.py | 2 +- .../punctuation_analysis/quote_convention.py | 76 ++++---- .../quote_convention_detector.py | 4 +- .../quote_convention_set.py | 20 +-- .../punctuation_analysis/text_segment.py | 45 ++--- .../usfm_structure_extractor.py | 2 +- machine/corpora/punctuation_analysis/verse.py | 4 +- .../quotation_mark_update_first_pass.py | 16 +- ...otation_mark_update_resolution_settings.py | 3 +- .../corpora/quotation_mark_update_settings.py | 14 +- ...tion_changing_usfm_update_block_handler.py | 33 ++-- ...est_depth_based_quotation_mark_resolver.py | 166 +++++------------- .../test_quotation_mark_finder.py | 26 +-- .../test_quotation_mark_metadata.py | 6 +- .../test_quotation_mark_resolver.py | 4 +- .../test_quote_convention.py | 52 +++--- .../punctuation_analysis/test_text_segment.py | 104 +++-------- .../punctuation_analysis/test_verse.py | 12 +- .../test_fallback_quotation_mark_resolver.py | 58 ++---- .../corpora/test_quotation_denormalization.py | 53 ++++++ ...normalization_usfm_block_update_handler.py | 14 +- .../test_quotation_mark_update_first_pass.py | 22 +-- ...tion_changing_usfm_block_update_handler.py | 70 ++++---- 28 files changed, 366 insertions(+), 510 deletions(-) create mode 100644 tests/corpora/test_quotation_denormalization.py diff --git a/machine/corpora/fallback_quotation_mark_resolver.py b/machine/corpora/fallback_quotation_mark_resolver.py index fa337d11..41b33a5e 100644 --- a/machine/corpora/fallback_quotation_mark_resolver.py +++ b/machine/corpora/fallback_quotation_mark_resolver.py @@ -1,4 +1,4 @@ -from typing import Generator, Set, Union +from typing import Generator, Optional, Set from .punctuation_analysis.quotation_mark_direction import QuotationMarkDirection from .punctuation_analysis.quotation_mark_metadata import QuotationMarkMetadata @@ -12,7 +12,7 @@ class FallbackQuotationMarkResolver(QuotationMarkResolver): def 
__init__(self, settings: QuotationMarkResolutionSettings): self._settings: QuotationMarkResolutionSettings = settings - self._last_quotation_mark: Union[QuotationMarkMetadata, None] = None + self._last_quotation_mark: Optional[QuotationMarkMetadata] = None self._issues: Set[QuotationMarkResolutionIssue] = set() def reset(self) -> None: @@ -30,13 +30,13 @@ def _resolve_quotation_mark( quotation_mark_match: QuotationMarkStringMatch, ) -> Generator[QuotationMarkMetadata, None, None]: if self._is_opening_quotation_mark(quotation_mark_match): - quotation_mark: Union[QuotationMarkMetadata, None] = self._resolve_opening_mark(quotation_mark_match) + quotation_mark: Optional[QuotationMarkMetadata] = self._resolve_opening_mark(quotation_mark_match) if quotation_mark is not None: yield quotation_mark else: self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) elif self._is_closing_quotation_mark(quotation_mark_match): - quotation_mark: Union[QuotationMarkMetadata, None] = self._resolve_closing_mark(quotation_mark_match) + quotation_mark: Optional[QuotationMarkMetadata] = self._resolve_closing_mark(quotation_mark_match) if quotation_mark is not None: yield quotation_mark else: @@ -94,9 +94,7 @@ def _is_closing_quotation_mark( return False - def _resolve_opening_mark( - self, quotation_mark_match: QuotationMarkStringMatch - ) -> Union[QuotationMarkMetadata, None]: + def _resolve_opening_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]: possible_depths: Set[int] = self._settings.get_possible_depths( quotation_mark_match.quotation_mark, QuotationMarkDirection.OPENING ) @@ -107,9 +105,7 @@ def _resolve_opening_mark( self._last_quotation_mark = quotation_mark return quotation_mark - def _resolve_closing_mark( - self, quotation_mark_match: QuotationMarkStringMatch - ) -> Union[QuotationMarkMetadata, None]: + def _resolve_closing_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> 
Optional[QuotationMarkMetadata]: possible_depths: Set[int] = self._settings.get_possible_depths( quotation_mark_match.quotation_mark, QuotationMarkDirection.CLOSING ) diff --git a/machine/corpora/punctuation_analysis/__init__.py b/machine/corpora/punctuation_analysis/__init__.py index 4ac5d9df..5aaeb99a 100644 --- a/machine/corpora/punctuation_analysis/__init__.py +++ b/machine/corpora/punctuation_analysis/__init__.py @@ -26,6 +26,7 @@ from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector from .quote_convention_set import QuoteConventionSet +from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS from .text_segment import TextSegment from .usfm_marker_type import UsfmMarkerType from .usfm_structure_extractor import UsfmStructureExtractor @@ -59,6 +60,7 @@ "QuotationMarkTabulator", "QuoteConventionDetector", "QuoteConventionSet", + "STANDARD_QUOTE_CONVENTIONS", "TextSegment", "UsfmMarkerType", "UsfmStructureExtractor", diff --git a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py index 55c2d398..1de92884 100644 --- a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py +++ b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py @@ -1,5 +1,5 @@ from enum import Enum, auto -from typing import Generator, Set, Union +from typing import Generator, Optional, Set import regex @@ -90,7 +90,7 @@ def add_quote_continuer( ) self._quote_continuer_mark_stack.append(quotation_mark) self._continuer_style = quote_continuer_style - if len(self._quote_continuer_mark_stack) == len(quotation_mark_resolver_state._quotation_stack): + if self.current_depth == quotation_mark_resolver_state.current_depth: self._quote_continuer_mark_stack.clear() return quotation_mark @@ -111,8 +111,8 @@ def 
__init__( def is_english_quote_continuer( self, quotation_mark_match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, None], - next_match: Union[QuotationMarkStringMatch, None], + previous_match: Optional[QuotationMarkStringMatch], + next_match: Optional[QuotationMarkStringMatch], ) -> bool: if self._quote_continuer_state.continuer_style == QuoteContinuerStyle.SPANISH: return False @@ -141,8 +141,8 @@ def is_english_quote_continuer( def is_spanish_quote_continuer( self, quotation_mark_match: QuotationMarkStringMatch, - previous_match: Union[QuotationMarkStringMatch, None], - next_match: Union[QuotationMarkStringMatch, None], + previous_match: Optional[QuotationMarkStringMatch], + next_match: Optional[QuotationMarkStringMatch], ) -> bool: if self._quote_continuer_state.continuer_style == QuoteContinuerStyle.ENGLISH: return False @@ -161,7 +161,7 @@ def is_spanish_quote_continuer( if quotation_mark_match._start_index > 0: return False - # this has only been observed with guillemets so far + # This has only been observed with guillemets so far if quotation_mark_match.quotation_mark != "»": return False @@ -194,7 +194,7 @@ def is_opening_quotation_mark( if not self._settings.is_valid_opening_quotation_mark(quotation_mark_match): return False - # if the quote convention is ambiguous, use whitespace as a clue + # If the quote convention is ambiguous, use whitespace as a clue if self._settings.is_valid_closing_quotation_mark(quotation_mark_match): return ( quotation_mark_match.has_leading_whitespace() @@ -213,7 +213,7 @@ def is_closing_quotation_mark( if not self._settings.is_valid_closing_quotation_mark(quotation_mark_match): return False - # if the quote convention is ambiguous, use whitespace as a clue + # If the quote convention is ambiguous, use whitespace as a clue if self._settings.is_valid_opening_quotation_mark(quotation_mark_match): return ( quotation_mark_match.has_trailing_whitespace() @@ -285,7 +285,7 @@ def 
_most_recent_opening_mark_immediately_precedes(self, match: QuotationMarkStr def is_apostrophe( self, quotation_mark_match: QuotationMarkStringMatch, - next_match: Union[QuotationMarkStringMatch, None], + next_match: Optional[QuotationMarkStringMatch], ) -> bool: if not quotation_mark_match.quotation_mark_matches(self._APOSTROPHE_PATTERN): return False @@ -299,11 +299,11 @@ def is_apostrophe( ): return True - # potential final s possessive (e.g. Moses') + # Potential final s possessive (e.g. Moses') if quotation_mark_match.previous_character_matches(regex.compile(r"s")) and ( quotation_mark_match.has_trailing_whitespace() or quotation_mark_match.has_trailing_punctuation() ): - # check whether it could be a closing quotation mark + # Check whether it could be a closing quotation mark if not self._quotation_mark_resolver_state.has_open_quotation_mark(): return True if not self._settings.are_marks_a_valid_pair( @@ -317,7 +317,7 @@ def is_apostrophe( ): return True - # for languages that use apostrophes at the start and end of words + # For languages that use apostrophes at the start and end of words if ( not self._quotation_mark_resolver_state.has_open_quotation_mark() and quotation_mark_match.quotation_mark == "'" @@ -360,8 +360,8 @@ def resolve_quotation_marks( def _resolve_quotation_mark( self, quotation_mark_match: QuotationMarkStringMatch, - previous_mark: Union[QuotationMarkStringMatch, None], - next_mark: Union[QuotationMarkStringMatch, None], + previous_mark: Optional[QuotationMarkStringMatch], + next_mark: Optional[QuotationMarkStringMatch], ) -> Generator[QuotationMarkMetadata, None, None]: if self._quotation_mark_categorizer.is_opening_quotation_mark(quotation_mark_match): if self._quotation_mark_categorizer.is_english_quote_continuer( diff --git a/machine/corpora/punctuation_analysis/preliminary_quotation_mark_analyzer.py b/machine/corpora/punctuation_analysis/preliminary_quotation_mark_analyzer.py index 7367d5cc..47ada522 100644 --- 
a/machine/corpora/punctuation_analysis/preliminary_quotation_mark_analyzer.py +++ b/machine/corpora/punctuation_analysis/preliminary_quotation_mark_analyzer.py @@ -145,7 +145,7 @@ def _group_quotation_marks(self, quotation_marks: List[QuotationMarkStringMatch] def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: for mark1, matches1 in self._grouped_quotation_marks.items(): - # handle cases of identical opening/closing marks + # Handle cases of identical opening/closing marks if ( len(matches1) == 2 and self._quote_conventions.is_quotation_mark_direction_ambiguous(mark1) @@ -154,11 +154,11 @@ def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: yield (mark1, mark1) continue - # skip verses where quotation mark pairs are ambiguous + # Skip verses where quotation mark pairs are ambiguous if len(matches1) > 1: continue - # find matching closing marks + # Find matching closing marks for mark2, matches2 in self._grouped_quotation_marks.items(): if ( len(matches2) == 1 diff --git a/machine/corpora/punctuation_analysis/quotation_mark_string_match.py b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py index 9de4b8e2..67dd4a46 100644 --- a/machine/corpora/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py @@ -1,5 +1,5 @@ from re import Pattern -from typing import Union +from typing import Optional import regex @@ -12,7 +12,7 @@ class QuotationMarkStringMatch: - # extra stuff in the regex to handle Western Cham + # Extra stuff in the regex to handle Western Cham _LETTER_PATTERN: Pattern = regex.compile(r"[\p{L}\U0001E200-\U0001E28F]", regex.U) _LATIN_LETTER_PATTERN: Pattern = regex.compile(r"^\p{script=Latin}$", regex.U) _WHITESPACE_PATTERN: Pattern = regex.compile(r"[\s~]", regex.U) @@ -53,7 +53,7 @@ def previous_character_matches(self, regex_pattern: regex.Pattern) -> bool: return self.previous_character is not None and 
regex_pattern.search(self.previous_character) is not None @property - def previous_character(self) -> Union[str, None]: + def previous_character(self) -> Optional[str]: if self._start_index == 0: previous_segment = self._text_segment.previous_segment if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( @@ -64,7 +64,7 @@ def previous_character(self) -> Union[str, None]: return self._text_segment.text[self._start_index - 1] @property - def next_character(self) -> Union[str, None]: + def next_character(self) -> Optional[str]: if self.is_at_end_of_segment(): next_segment = self._text_segment.next_segment if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH): @@ -78,10 +78,10 @@ def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool: def trailing_substring_matches(self, regex_pattern: regex.Pattern) -> bool: return regex_pattern.search(self._text_segment.substring_after(self._end_index)) is not None - # this assumes that the two matches occur in the same verse + # This assumes that the two matches occur in the same verse def precedes(self, other: "QuotationMarkStringMatch") -> bool: - return self._text_segment._index_in_verse < other._text_segment._index_in_verse or ( - self._text_segment._index_in_verse == other._text_segment._index_in_verse + return self._text_segment.index_in_verse < other._text_segment.index_in_verse or ( + self._text_segment.index_in_verse == other._text_segment.index_in_verse and self._start_index < other._start_index ) @@ -97,7 +97,7 @@ def start_index(self) -> int: def end_index(self) -> int: return self._end_index - # not used, but a useful method for debugging + # Not used, but a useful method for debugging @property def context(self) -> str: return self._text_segment.text[ diff --git a/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py b/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py index 
ec17eba6..c76ff540 100644 --- a/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py @@ -54,7 +54,7 @@ def calculate_similarity(self, quote_convention: QuoteConvention) -> float: for depth, direction in self._quotation_counts_by_depth_and_direction: expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction) - # give higher weight to shallower depths, since deeper marks are more likely to be mistakes + # Give higher weight to shallower depths, since deeper marks are more likely to be mistakes weighted_difference += self._quotation_counts_by_depth_and_direction[ (depth, direction) ].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth) diff --git a/machine/corpora/punctuation_analysis/quote_convention.py b/machine/corpora/punctuation_analysis/quote_convention.py index 23063d42..386cd559 100644 --- a/machine/corpora/punctuation_analysis/quote_convention.py +++ b/machine/corpora/punctuation_analysis/quote_convention.py @@ -39,21 +39,21 @@ def normalize(self) -> "SingleLevelQuoteConvention": class QuoteConvention: - def __init__(self, name: str, levels: list[SingleLevelQuoteConvention]): + def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]): self._name = name - self.levels = levels + self.level_conventions = level_conventions def __eq__(self, value): if not isinstance(value, QuoteConvention): return False if self._name != value._name: return False - if len(self.levels) != len(value.levels): + if len(self.level_conventions) != len(value.level_conventions): return False - for level, other_level in zip(self.levels, value.levels): - if level.opening_quotation_mark != other_level.opening_quotation_mark: + for level_convention, other_level_convention in zip(self.level_conventions, value.level_conventions): + if level_convention.opening_quotation_mark != other_level_convention.opening_quotation_mark: return False - if 
level.closing_quotation_mark != other_level.closing_quotation_mark: + if level_convention.closing_quotation_mark != other_level_convention.closing_quotation_mark: return False return True @@ -63,41 +63,47 @@ def name(self) -> str: @property def num_levels(self) -> int: - return len(self.levels) + return len(self.level_conventions) - def get_opening_quotation_mark_at_level(self, level: int) -> str: - return self.levels[level - 1].opening_quotation_mark + def get_opening_quotation_mark_at_depth(self, depth: int) -> str: + return self.level_conventions[depth - 1].opening_quotation_mark - def get_closing_quotation_mark_at_level(self, level: int) -> str: - return self.levels[level - 1].closing_quotation_mark + def get_closing_quotation_mark_at_depth(self, depth: int) -> str: + return self.level_conventions[depth - 1].closing_quotation_mark def get_expected_quotation_mark(self, depth: int, direction: QuotationMarkDirection) -> str: if depth > self.num_levels or depth < 1: return "" return ( - self.get_opening_quotation_mark_at_level(depth) + self.get_opening_quotation_mark_at_depth(depth) if direction is QuotationMarkDirection.OPENING - else self.get_closing_quotation_mark_at_level(depth) + else self.get_closing_quotation_mark_at_depth(depth) ) def _includes_opening_quotation_mark(self, opening_quotation_mark: str) -> bool: - for level in self.levels: - if level.opening_quotation_mark == opening_quotation_mark: + for level_convention in self.level_conventions: + if level_convention.opening_quotation_mark == opening_quotation_mark: return True return False def _includes_closing_quotation_mark(self, closing_quotation_mark: str) -> bool: - for level in self.levels: - if level.closing_quotation_mark == closing_quotation_mark: + for level_convention in self.level_conventions: + if level_convention.closing_quotation_mark == closing_quotation_mark: return True return False def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: depths: 
Set[int] = set() - for depth, level in enumerate(self.levels, start=1): - if direction is QuotationMarkDirection.OPENING and level.opening_quotation_mark == quotation_mark: + for depth, level_convention in enumerate(self.level_conventions, start=1): + if ( + direction is QuotationMarkDirection.OPENING + and level_convention.opening_quotation_mark == quotation_mark + ): depths.add(depth) - elif direction is QuotationMarkDirection.CLOSING and level.closing_quotation_mark == quotation_mark: + elif ( + direction is QuotationMarkDirection.CLOSING + and level_convention.closing_quotation_mark == quotation_mark + ): depths.add(depth) return depths @@ -111,35 +117,37 @@ def is_compatible_with_observed_quotation_marks( if not self._includes_closing_quotation_mark(closing_quotation_mark): return False - # we require the first-level quotation marks to have been observed + # We require the first-level quotation marks to have been observed if ( - self.get_opening_quotation_mark_at_level(1) not in opening_quotation_marks - or self.get_closing_quotation_mark_at_level(1) not in closing_quotation_marks + self.get_opening_quotation_mark_at_depth(1) not in opening_quotation_marks + or self.get_closing_quotation_mark_at_depth(1) not in closing_quotation_marks ): return False return True def normalize(self) -> "QuoteConvention": - return QuoteConvention(self.name + "_normalized", [level.normalize() for level in self.levels]) + return QuoteConvention( + self.name + "_normalized", [level_convention.normalize() for level_convention in self.level_conventions] + ) def __str__(self) -> str: summary = self.name + "\n" - for level, convention in enumerate(self.levels): - ordinal_name = self._get_ordinal_name(level + 1) + for depth, level_convention in enumerate(self.level_conventions): + ordinal_name = self._get_ordinal_name(depth + 1) summary += "%s%s-level quote%s\n" % ( - convention.opening_quotation_mark, + level_convention.opening_quotation_mark, ordinal_name, - 
convention.closing_quotation_mark, + level_convention.closing_quotation_mark, ) return summary - def _get_ordinal_name(self, level) -> str: - if level == 1: + def _get_ordinal_name(self, depth) -> str: + if depth == 1: return "First" - if level == 2: + if depth == 2: return "Second" - if level == 3: + if depth == 3: return "Third" - if level == 4: + if depth == 4: return "Fourth" - return str(level) + "th" + return str(depth) + "th" diff --git a/machine/corpora/punctuation_analysis/quote_convention_detector.py b/machine/corpora/punctuation_analysis/quote_convention_detector.py index 733111a4..eac56445 100644 --- a/machine/corpora/punctuation_analysis/quote_convention_detector.py +++ b/machine/corpora/punctuation_analysis/quote_convention_detector.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Union +from typing import List, Optional from .chapter import Chapter from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver @@ -50,7 +50,7 @@ def _count_quotation_marks_in_chapter( self._quotation_mark_tabulator.tabulate(resolved_quotation_marks) - def detect_quote_convention(self, print_summary: bool) -> Union[QuoteConventionAnalysis, None]: + def detect_quote_convention(self, print_summary: bool) -> Optional[QuoteConventionAnalysis]: self._count_quotation_marks_in_chapters(self.get_chapters()) (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention( diff --git a/machine/corpora/punctuation_analysis/quote_convention_set.py b/machine/corpora/punctuation_analysis/quote_convention_set.py index a006310f..bef15639 100644 --- a/machine/corpora/punctuation_analysis/quote_convention_set.py +++ b/machine/corpora/punctuation_analysis/quote_convention_set.py @@ -1,6 +1,6 @@ from collections import defaultdict from re import Pattern -from typing import Dict, List, Set, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple import regex @@ -29,9 +29,9 @@ def 
_create_quotation_mark_regexes(self) -> None: closing_quotation_marks: Set[str] = set() for convention in self._conventions: - for level in range(1, convention.num_levels + 1): - opening_quotation_mark = convention.get_opening_quotation_mark_at_level(level) - closing_quotation_mark = convention.get_closing_quotation_mark_at_level(level) + for depth in range(1, convention.num_levels + 1): + opening_quotation_mark = convention.get_opening_quotation_mark_at_depth(depth) + closing_quotation_mark = convention.get_closing_quotation_mark_at_depth(depth) opening_quotation_marks.add(opening_quotation_mark) closing_quotation_marks.add(closing_quotation_mark) @@ -52,9 +52,9 @@ def _create_quotation_mark_pair_map(self) -> None: self.closing_marks_by_opening_mark: Dict[str, set[str]] = defaultdict(set) self.opening_marks_by_closing_mark: Dict[str, set[str]] = defaultdict(set) for convention in self._conventions: - for level in range(1, convention.num_levels + 1): - opening_quotation_mark = convention.get_opening_quotation_mark_at_level(level) - closing_quotation_mark = convention.get_closing_quotation_mark_at_level(level) + for depth in range(1, convention.num_levels + 1): + opening_quotation_mark = convention.get_opening_quotation_mark_at_depth(depth) + closing_quotation_mark = convention.get_closing_quotation_mark_at_depth(depth) self.closing_marks_by_opening_mark[opening_quotation_mark].add(closing_quotation_mark) self.opening_marks_by_closing_mark[closing_quotation_mark].add(opening_quotation_mark) @@ -70,7 +70,7 @@ def closing_quotation_mark_regex(self) -> Pattern: def quotation_mark_regex(self) -> Pattern: return self._all_quotation_mark_regex - def get_quote_convention_by_name(self, name: str) -> Union[QuoteConvention, None]: + def get_quote_convention_by_name(self, name: str) -> Optional[QuoteConvention]: for convention in self._conventions: if convention.name == name: return convention @@ -139,9 +139,9 @@ def filter_to_compatible_quote_conventions( def 
find_most_similar_convention( self, tabulated_quotation_marks: QuotationMarkTabulator - ) -> Tuple[Union[QuoteConvention, None], float]: + ) -> Tuple[Optional[QuoteConvention], float]: best_similarity: float = float("-inf") - best_quote_convention: Union[QuoteConvention, None] = None + best_quote_convention: Optional[QuoteConvention] = None for quote_convention in self._conventions: similarity = tabulated_quotation_marks.calculate_similarity(quote_convention) if similarity > best_similarity: diff --git a/machine/corpora/punctuation_analysis/text_segment.py b/machine/corpora/punctuation_analysis/text_segment.py index 96ff01d1..cae3e387 100644 --- a/machine/corpora/punctuation_analysis/text_segment.py +++ b/machine/corpora/punctuation_analysis/text_segment.py @@ -1,4 +1,4 @@ -from typing import Set, Union +from typing import Optional, Set from ..usfm_token import UsfmToken from .usfm_marker_type import UsfmMarkerType @@ -9,20 +9,20 @@ def __init__(self): self._text = "" self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER self._markers_in_preceding_context: Set[UsfmMarkerType] = set() - self._previous_segment: Union[TextSegment, None] = None - self._next_segment: Union[TextSegment, None] = None - self._index_in_verse: int = 0 - self._num_segments_in_verse: int = 0 - self._usfm_token: Union[UsfmToken, None] = None + self.previous_segment: Optional[TextSegment] = None + self.next_segment: Optional[TextSegment] = None + self.index_in_verse: int = 0 + self.num_segments_in_verse: int = 0 + self._usfm_token: Optional[UsfmToken] = None def __eq__(self, value): if not isinstance(value, TextSegment): return False if self._text != value._text: return False - if self._index_in_verse != value._index_in_verse: + if self.index_in_verse != value.index_in_verse: return False - if self._num_segments_in_verse != value._num_segments_in_verse: + if self.num_segments_in_verse != value.num_segments_in_verse: return False if self._usfm_token != value._usfm_token: 
return False @@ -34,14 +34,6 @@ def __eq__(self, value): def text(self) -> str: return self._text - @property - def previous_segment(self) -> Union["TextSegment", None]: - return self._previous_segment - - @property - def next_segment(self) -> Union["TextSegment", None]: - return self._next_segment - @property def length(self) -> int: return len(self._text) @@ -56,37 +48,22 @@ def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool: return marker in self._markers_in_preceding_context def is_first_segment_in_verse(self) -> bool: - return self._index_in_verse == 0 + return self.index_in_verse == 0 def is_last_segment_in_verse(self) -> bool: - return self._index_in_verse == self._num_segments_in_verse - 1 + return self.index_in_verse == self.num_segments_in_verse - 1 def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index) if self._usfm_token is not None: self._usfm_token.text = self._text - # These setters need to be implemented outside the builder to avoid circular dependencies - @previous_segment.setter - def previous_segment(self, previous_segment: "TextSegment") -> None: - self._previous_segment = previous_segment - - @next_segment.setter - def next_segment(self, next_segment: "TextSegment") -> None: - self._next_segment = next_segment - - def set_index_in_verse(self, index_in_verse: int) -> None: - self._index_in_verse = index_in_verse - - def set_num_segments_in_verse(self, num_segments_in_verse: int) -> None: - self._num_segments_in_verse = num_segments_in_verse - class Builder: def __init__(self): self._text_segment = TextSegment() def set_previous_segment(self, previous_segment: "TextSegment") -> "TextSegment.Builder": - self._text_segment._previous_segment = previous_segment + self._text_segment.previous_segment = previous_segment return self def add_preceding_marker(self, marker: UsfmMarkerType) -> 
"TextSegment.Builder": diff --git a/machine/corpora/punctuation_analysis/usfm_structure_extractor.py b/machine/corpora/punctuation_analysis/usfm_structure_extractor.py index 303e30f4..02b22ce6 100644 --- a/machine/corpora/punctuation_analysis/usfm_structure_extractor.py +++ b/machine/corpora/punctuation_analysis/usfm_structure_extractor.py @@ -70,7 +70,7 @@ def text(self, state: UsfmParserState, text: str) -> None: if len(text) > 0: self._next_text_segment_builder.set_text(text) text_segment: TextSegment = self._next_text_segment_builder.build() - # don't look past verse boundaries, to enable identical functionality in the + # Don't look past verse boundaries, to enable identical functionality in the # online one-verse-at-a-time (QuotationMarkDenormalizationScriptureUpdateBlockHandler) # and offline whole-book-at-once settings (QuoteConventionDetector) if len(self._text_segments) > 0 and not text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): diff --git a/machine/corpora/punctuation_analysis/verse.py b/machine/corpora/punctuation_analysis/verse.py index 00916586..9c871421 100644 --- a/machine/corpora/punctuation_analysis/verse.py +++ b/machine/corpora/punctuation_analysis/verse.py @@ -8,8 +8,8 @@ def __init__(self, text_segments: list[TextSegment]): def _index_text_segments(self) -> None: for index, text_segment in enumerate(self._text_segments): - text_segment.set_index_in_verse(index) - text_segment.set_num_segments_in_verse(len(self._text_segments)) + text_segment.index_in_verse = index + text_segment.num_segments_in_verse = len(self._text_segments) @property def text_segments(self) -> list[TextSegment]: diff --git a/machine/corpora/quotation_mark_update_first_pass.py b/machine/corpora/quotation_mark_update_first_pass.py index 4e3f5a43..ee5827a5 100644 --- a/machine/corpora/quotation_mark_update_first_pass.py +++ b/machine/corpora/quotation_mark_update_first_pass.py @@ -24,7 +24,7 @@ def __init__(self, source_quote_convention: QuoteConvention, 
target_quote_conven QuoteConventionSet([source_quote_convention]) ) self._quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(source_quote_convention, target_quote_convention) + QuotationMarkUpdateResolutionSettings(source_quote_convention) ) self._will_fallback_mode_work: bool = self._check_whether_fallback_mode_will_work( source_quote_convention, target_quote_convention @@ -34,13 +34,13 @@ def _check_whether_fallback_mode_will_work( self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention ) -> bool: target_marks_by_source_marks: Dict[str, Set[str]] = {} - for level in range(1, source_quote_convention.num_levels + 1): - opening_quotation_mark = source_quote_convention.get_opening_quotation_mark_at_level(level) + for depth in range(1, source_quote_convention.num_levels + 1): + opening_quotation_mark = source_quote_convention.get_opening_quotation_mark_at_depth(depth) if opening_quotation_mark not in target_marks_by_source_marks: target_marks_by_source_marks[opening_quotation_mark] = set() - if level <= target_quote_convention.num_levels: + if depth <= target_quote_convention.num_levels: target_marks_by_source_marks[opening_quotation_mark].add( - target_quote_convention.get_closing_quotation_mark_at_level(level) + target_quote_convention.get_closing_quotation_mark_at_depth(depth) ) for source_mark in target_marks_by_source_marks: @@ -63,12 +63,14 @@ def _find_best_strategy_for_chapter(self, chapter: Chapter) -> QuotationMarkUpda self._quotation_mark_resolver.reset() - # use list() to force evaluation of the generator + # Use list() to force evaluation of the generator list(self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches)) return self._choose_best_strategy_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) - def _choose_best_strategy_based_on_observed_issues(self, issues) -> QuotationMarkUpdateStrategy: + def 
_choose_best_strategy_based_on_observed_issues( + self, issues: Set[QuotationMarkResolutionIssue] + ) -> QuotationMarkUpdateStrategy: if QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues: return QuotationMarkUpdateStrategy.SKIP diff --git a/machine/corpora/quotation_mark_update_resolution_settings.py b/machine/corpora/quotation_mark_update_resolution_settings.py index 0628e07c..cd37d43d 100644 --- a/machine/corpora/quotation_mark_update_resolution_settings.py +++ b/machine/corpora/quotation_mark_update_resolution_settings.py @@ -10,10 +10,9 @@ class QuotationMarkUpdateResolutionSettings(QuotationMarkResolutionSettings): - def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): + def __init__(self, source_quote_convention: QuoteConvention): self._source_quote_convention = source_quote_convention self._quote_convention_singleton_set = QuoteConventionSet([self._source_quote_convention]) - self._target_quote_convention = target_quote_convention def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: return quotation_mark_match.is_valid_opening_quotation_mark(self._quote_convention_singleton_set) diff --git a/machine/corpora/quotation_mark_update_settings.py b/machine/corpora/quotation_mark_update_settings.py index cb4de267..006f413c 100644 --- a/machine/corpora/quotation_mark_update_settings.py +++ b/machine/corpora/quotation_mark_update_settings.py @@ -5,13 +5,13 @@ class QuotationMarkUpdateSettings: def __init__( self, - default_chapter_action: QuotationMarkUpdateStrategy = QuotationMarkUpdateStrategy.APPLY_FULL, - chapter_actions: list[QuotationMarkUpdateStrategy] = [], + default_chapter_strategy: QuotationMarkUpdateStrategy = QuotationMarkUpdateStrategy.APPLY_FULL, + chapter_strategies: list[QuotationMarkUpdateStrategy] = [], ): - self._default_chapter_action = default_chapter_action - self._chapter_actions = chapter_actions + self._default_chapter_strategy = 
default_chapter_strategy + self._chapter_strategies = chapter_strategies def get_action_for_chapter(self, chapter_number: int) -> QuotationMarkUpdateStrategy: - if chapter_number <= len(self._chapter_actions): - return self._chapter_actions[chapter_number - 1] - return self._default_chapter_action + if chapter_number <= len(self._chapter_strategies): + return self._chapter_strategies[chapter_number - 1] + return self._default_chapter_strategy diff --git a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py index 37e069a6..bdc648ee 100644 --- a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py +++ b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List, Optional from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver from .punctuation_analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver @@ -36,9 +36,7 @@ def __init__( ) self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder() - resolution_settings = QuotationMarkUpdateResolutionSettings( - self._source_quote_convention, self._target_quote_convention - ) + resolution_settings = QuotationMarkUpdateResolutionSettings(self._source_quote_convention) # Each embed represents a separate context for quotation marks # (i.e. 
you can't open a quote in one context and close it in another) @@ -66,12 +64,12 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: return self._apply_standard_updating(block) def _apply_fallback_updating(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: - for element in block._elements: + for element in block.elements: self._process_scripture_element(element, self._simple_quotation_mark_resolver) return block def _apply_standard_updating(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: - for element in block._elements: + for element in block.elements: if element.type == UsfmUpdateBlockElementType.EMBED: self._embed_quotation_mark_resolver.reset() self._process_scripture_element(element, self._embed_quotation_mark_resolver) @@ -102,20 +100,19 @@ def _create_text_segments(self, element: UsfmUpdateBlockElement) -> List[TextSeg elif token.type == UsfmTokenType.NOTE: self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) elif token.type == UsfmTokenType.TEXT: - text_segment: Union[TextSegment, None] = self._create_text_segment(token) + text_segment: Optional[TextSegment] = self._create_text_segment(token) if text_segment is not None: text_segments.append(text_segment) return self._set_previous_and_next_for_segments(text_segments) - def _create_text_segment(self, token: UsfmToken) -> Union[TextSegment, None]: + def _create_text_segment(self, token: UsfmToken) -> Optional[TextSegment]: self._next_scripture_text_segment_builder.set_usfm_token(token) + text_segment_to_return: Optional[TextSegment] = None if token.text is not None: self._next_scripture_text_segment_builder.set_text(token.text) - text_segment_to_return: TextSegment = self._next_scripture_text_segment_builder.build() - self._next_scripture_text_segment_builder = TextSegment.Builder() - return text_segment_to_return - else: - self._next_scripture_text_segment_builder = TextSegment.Builder() + text_segment_to_return = 
self._next_scripture_text_segment_builder.build() + self._next_scripture_text_segment_builder = TextSegment.Builder() + return text_segment_to_return def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) -> List[TextSegment]: for i in range(len(text_segments)): @@ -128,10 +125,10 @@ def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) def _check_for_chapter_change(self, block: UsfmUpdateBlock) -> None: for scripture_ref in block.refs: if scripture_ref.chapter_num != self._current_chapter_number: - self._current_chapter_number = scripture_ref.chapter_num - self._start_new_chapter(self._current_chapter_number) + self._start_new_chapter(scripture_ref.chapter_num) def _start_new_chapter(self, new_chapter_number: int) -> None: + self._current_chapter_number = new_chapter_number self._current_strategy = self._settings.get_action_for_chapter(new_chapter_number) self._verse_text_quotation_mark_resolver.reset() self._next_scripture_text_segment_builder = TextSegment.Builder() @@ -143,8 +140,8 @@ def _check_for_verse_change(self, block: UsfmUpdateBlock) -> None: scripture_ref.chapter_num == self._current_chapter_number and scripture_ref.verse_num != self._current_verse_number ): - self._current_verse_number = scripture_ref.verse_num - self._start_new_verse(self._current_verse_number) + self._start_new_verse(scripture_ref.verse_num) - def _start_new_verse(self, new_chapter_number: int) -> None: + def _start_new_verse(self, new_verse_number: int) -> None: + self._current_verse_number = new_verse_number self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VERSE) diff --git a/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py b/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py index 5b832852..3a69af3d 100644 --- a/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py +++ 
b/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py @@ -2,6 +2,7 @@ from machine.corpora import QuotationMarkUpdateResolutionSettings from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, DepthBasedQuotationMarkResolver, QuotationMarkCategorizer, QuotationMarkDirection, @@ -15,7 +16,6 @@ QuoteConventionSet, TextSegment, UsfmMarkerType, - standard_quote_conventions, ) @@ -315,9 +315,7 @@ def test_add_quotation_continuer() -> None: def test_is_english_quotation_continuer() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None english_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -380,7 +378,7 @@ def test_is_english_quotation_continuer() -> None: ) quotation_mark_categorizer_for_denormalization = QuotationMarkCategorizer( - QuotationMarkUpdateResolutionSettings(standard_english_quote_convention, standard_english_quote_convention), + QuotationMarkUpdateResolutionSettings(standard_english_quote_convention), quotation_mark_resolver_state, quotation_continuer_state, ) @@ -563,9 +561,7 @@ def test_is_english_quotation_continuer() -> None: def test_is_spanish_quotation_continuer() -> None: - western_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") - ) + western_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") assert western_european_quote_convention is not None spanish_resolver_settings = QuoteConventionDetectionResolutionSettings( @@ -628,7 +624,7 @@ def test_is_spanish_quotation_continuer() -> None: ) quotation_mark_categorizer_for_denormalization = QuotationMarkCategorizer( - 
QuotationMarkUpdateResolutionSettings(western_european_quote_convention, western_european_quote_convention), + QuotationMarkUpdateResolutionSettings(western_european_quote_convention), quotation_mark_resolver_state, quotation_continuer_state, ) @@ -811,9 +807,7 @@ def test_is_spanish_quotation_continuer() -> None: def test_is_opening_quote() -> None: - central_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") - ) + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([central_european_quote_convention]) @@ -824,9 +818,7 @@ def test_is_opening_quote() -> None: central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - british_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") - ) + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([british_english_quote_convention]) @@ -835,9 +827,7 @@ def test_is_opening_quote() -> None: british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - standard_swedish_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") - ) + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( 
QuoteConventionSet([standard_swedish_quote_convention]) @@ -1038,9 +1028,7 @@ def test_is_opening_quote() -> None: def test_is_closing_quote() -> None: - central_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") - ) + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([central_european_quote_convention]) @@ -1051,9 +1039,7 @@ def test_is_closing_quote() -> None: central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - british_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") - ) + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([british_english_quote_convention]) @@ -1062,9 +1048,7 @@ def test_is_closing_quote() -> None: british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - standard_swedish_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") - ) + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_swedish_quote_convention]) @@ -1073,9 +1057,7 @@ def test_is_closing_quote() -> None: standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - standard_french_quote_convention 
= ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_french") - ) + standard_french_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_french") assert standard_french_quote_convention is not None standard_french_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_french_quote_convention]) @@ -1222,9 +1204,7 @@ def test_is_closing_quote() -> None: def test_is_malformed_opening_quote() -> None: - central_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") - ) + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([central_european_quote_convention]) @@ -1235,9 +1215,7 @@ def test_is_malformed_opening_quote() -> None: central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - british_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") - ) + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([british_english_quote_convention]) @@ -1246,9 +1224,7 @@ def test_is_malformed_opening_quote() -> None: british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - standard_swedish_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") - ) + standard_swedish_quote_convention = 
STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_swedish_quote_convention]) @@ -1423,9 +1399,7 @@ def test_is_malformed_opening_quote() -> None: def test_is_malformed_closing_quote() -> None: - central_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") - ) + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([central_european_quote_convention]) @@ -1436,9 +1410,7 @@ def test_is_malformed_closing_quote() -> None: central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - british_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") - ) + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([british_english_quote_convention]) @@ -1447,9 +1419,7 @@ def test_is_malformed_closing_quote() -> None: british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - standard_swedish_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") - ) + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = 
QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_swedish_quote_convention]) @@ -1621,9 +1591,7 @@ def test_is_malformed_closing_quote() -> None: def test_is_unpaired_closing_quote() -> None: - central_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") - ) + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([central_european_quote_convention]) @@ -1634,9 +1602,7 @@ def test_is_unpaired_closing_quote() -> None: central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - british_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") - ) + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") assert british_english_quote_convention is not None british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([british_english_quote_convention]) @@ -1645,9 +1611,7 @@ def test_is_unpaired_closing_quote() -> None: british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - standard_swedish_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") - ) + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_swedish_quote_convention]) @@ -1808,9 +1772,7 @@ def test_is_unpaired_closing_quote() -> None: def test_is_apostrophe() -> None: - 
standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -1821,9 +1783,7 @@ def test_is_apostrophe() -> None: standard_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state ) - typewriter_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") - ) + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") assert typewriter_english_quote_convention is not None typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([typewriter_english_quote_convention]) @@ -1973,9 +1933,7 @@ def test_is_apostrophe() -> None: # DepthBasedQuotationMarkResolver tests def test_depth_based_quotation_mark_resolver_reset() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2005,9 +1963,7 @@ def test_depth_based_quotation_mark_resolver_reset() -> None: def test_basic_quotation_mark_recognition() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + 
standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2034,9 +1990,7 @@ def test_basic_quotation_mark_recognition() -> None: def test_resolution_only_of_passed_matches() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2074,9 +2028,7 @@ def test_resolution_only_of_passed_matches() -> None: def test_resolution_across_segments() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2104,9 +2056,7 @@ def test_resolution_across_segments() -> None: def test_resolution_with_apostrophes() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( 
QuoteConventionSet([standard_english_quote_convention]) @@ -2137,9 +2087,7 @@ def test_resolution_with_apostrophes() -> None: ] assert standard_english_quotation_mark_resolver.get_issues() == set() - typewriter_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") - ) + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") assert typewriter_english_quote_convention is not None typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([typewriter_english_quote_convention]) @@ -2169,9 +2117,7 @@ def test_resolution_with_apostrophes() -> None: def test_english_quote_continuers() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2208,9 +2154,7 @@ def test_english_quote_continuers() -> None: def test_spanish_quote_continuers() -> None: - western_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") - ) + western_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") assert western_european_quote_convention is not None western_european_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([western_european_quote_convention]) @@ -2247,9 +2191,7 @@ def test_spanish_quote_continuers() -> None: def test_malformed_quotation_marks() -> None: - standard_english_quote_convention = ( - 
standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2282,9 +2224,7 @@ def test_malformed_quotation_marks() -> None: def test_unpaired_quotation_mark_issue() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2325,9 +2265,7 @@ def test_unpaired_quotation_mark_issue() -> None: def test_too_deep_nesting_issue() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2361,9 +2299,7 @@ def test_too_deep_nesting_issue() -> None: def test_incompatible_quotation_mark_issue() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None 
standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_english_quote_convention]) @@ -2392,9 +2328,7 @@ def test_incompatible_quotation_mark_issue() -> None: def test_ambiguous_quotation_mark_issue() -> None: - typewriter_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") - ) + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") assert typewriter_english_quote_convention is not None typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([typewriter_english_quote_convention]) @@ -2432,9 +2366,7 @@ def test_ambiguous_quotation_mark_issue() -> None: def test_typewriter_english_quotation_mark_recognition() -> None: - typewriter_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") - ) + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") assert typewriter_english_quote_convention is not None typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([typewriter_english_quote_convention]) @@ -2463,9 +2395,7 @@ def test_typewriter_english_quotation_mark_recognition() -> None: def test_typewriter_french_mark_recognition() -> None: - typewriter_french_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_french") - ) + typewriter_french_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_french") assert typewriter_french_quote_convention is not None typewriter_french_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([typewriter_french_quote_convention]) @@ -2492,9 +2422,7 @@ def 
test_typewriter_french_mark_recognition() -> None: def test_central_european_quotation_mark_recognition() -> None: - central_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") - ) + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") assert central_european_quote_convention is not None central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([central_european_quote_convention]) @@ -2526,9 +2454,7 @@ def test_central_european_quotation_mark_recognition() -> None: def test_standard_swedish_quotation_mark_recognition() -> None: - standard_swedish_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") - ) + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") assert standard_swedish_quote_convention is not None standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet([standard_swedish_quote_convention]) @@ -2560,19 +2486,13 @@ def test_standard_swedish_quotation_mark_recognition() -> None: def test_multiple_conventions_quotation_mark_recognition() -> None: - typewriter_french_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_french") - ) + typewriter_french_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_french") assert typewriter_french_quote_convention is not None - central_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") - ) + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") assert central_european_quote_convention is not None - standard_swedish_quote_convention 
= ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") - ) + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") assert standard_swedish_quote_convention is not None multiple_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( QuoteConventionSet( diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py b/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py index 5fbfe5fc..d0f66b6b 100644 --- a/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py @@ -1,14 +1,14 @@ from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, QuotationMarkFinder, QuotationMarkStringMatch, QuoteConventionSet, TextSegment, - standard_quote_conventions, ) def test_that_all_possible_quotation_marks_are_identified() -> None: - quotation_mark_finder = QuotationMarkFinder(standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS) + quotation_mark_finder = QuotationMarkFinder(STANDARD_QUOTE_CONVENTIONS) assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( TextSegment.Builder().set_text("\u201cSample Text\u201d").build() ) == [ @@ -177,9 +177,7 @@ def test_that_all_possible_quotation_marks_are_identified() -> None: def test_that_it_uses_the_quote_convention_set() -> None: - standard_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") - ) + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert standard_english_quote_convention is not None english_quotation_mark_finder = QuotationMarkFinder(QuoteConventionSet([standard_english_quote_convention])) @@ -192,9 +190,7 @@ def test_that_it_uses_the_quote_convention_set() -> None: == [] ) - 
typewriter_english_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") - ) + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") assert typewriter_english_quote_convention is not None typewriter_english_quotation_mark_finder = QuotationMarkFinder( @@ -214,9 +210,7 @@ def test_that_it_uses_the_quote_convention_set() -> None: ) ] - western_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") - ) + western_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") assert western_european_quote_convention is not None western_european_quotation_mark_finder = QuotationMarkFinder( @@ -236,10 +230,8 @@ def test_that_it_uses_the_quote_convention_set() -> None: ) ] - typewriter_western_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "typewriter_western_european" - ) + typewriter_western_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( + "typewriter_western_european" ) assert typewriter_western_european_quote_convention is not None @@ -267,9 +259,7 @@ def test_that_it_uses_the_quote_convention_set() -> None: ), ] - central_european_quote_convention = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") - ) + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") assert central_european_quote_convention is not None central_european_quotation_mark_finder = QuotationMarkFinder( diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py b/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py index 912cf11a..e4e65d9b 100644 --- 
a/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py @@ -1,11 +1,11 @@ from typing import Union from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, QuotationMarkDirection, QuotationMarkMetadata, QuoteConvention, TextSegment, - standard_quote_conventions, ) @@ -45,8 +45,6 @@ def test_update_quotation_mark() -> None: def get_quote_convention_by_name(name: str) -> QuoteConvention: - quote_convention: Union[QuoteConvention, None] = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) - ) + quote_convention: Union[QuoteConvention, None] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) assert quote_convention is not None return quote_convention diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py b/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py index ce17063c..a91e77e4 100644 --- a/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py @@ -1,19 +1,19 @@ from typing import List from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, DepthBasedQuotationMarkResolver, QuotationMarkResolver, QuotationMarkStringMatch, QuoteConventionDetectionResolutionSettings, TextSegment, UsfmMarkerType, - standard_quote_conventions, ) def test_reset() -> None: quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( - QuoteConventionDetectionResolutionSettings(standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS) + QuoteConventionDetectionResolutionSettings(STANDARD_QUOTE_CONVENTIONS) ) assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] diff --git a/tests/corpora/punctuation_analysis/test_quote_convention.py b/tests/corpora/punctuation_analysis/test_quote_convention.py index 89e413a2..514ffdf1 100644 --- 
a/tests/corpora/punctuation_analysis/test_quote_convention.py +++ b/tests/corpora/punctuation_analysis/test_quote_convention.py @@ -123,9 +123,9 @@ def test_get_opening_quote_at_level() -> None: SingleLevelQuoteConvention("\u00ab", "\u00bb"), ], ) - assert quote_convention.get_opening_quotation_mark_at_level(1) == "\u201c" - assert quote_convention.get_opening_quotation_mark_at_level(2) == "\u2018" - assert quote_convention.get_opening_quotation_mark_at_level(3) == "\u00ab" + assert quote_convention.get_opening_quotation_mark_at_depth(1) == "\u201c" + assert quote_convention.get_opening_quotation_mark_at_depth(2) == "\u2018" + assert quote_convention.get_opening_quotation_mark_at_depth(3) == "\u00ab" def test_get_closing_quote_at_level() -> None: @@ -137,9 +137,9 @@ def test_get_closing_quote_at_level() -> None: SingleLevelQuoteConvention("\u00ab", "\u00bb"), ], ) - assert quote_convention.get_closing_quotation_mark_at_level(1) == "\u201d" - assert quote_convention.get_closing_quotation_mark_at_level(2) == "\u2019" - assert quote_convention.get_closing_quotation_mark_at_level(3) == "\u00bb" + assert quote_convention.get_closing_quotation_mark_at_depth(1) == "\u201d" + assert quote_convention.get_closing_quotation_mark_at_depth(2) == "\u2019" + assert quote_convention.get_closing_quotation_mark_at_depth(3) == "\u00bb" def test_get_expected_quotation_mark() -> None: @@ -313,14 +313,14 @@ def test_normalize() -> None: normalized_standard_english_quote_convention = standard_english_quote_convention.normalize() assert normalized_standard_english_quote_convention.name == "standard-english-quote-convention_normalized" assert normalized_standard_english_quote_convention.num_levels == 4 - assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_level(1) == '"' - assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_level(1) == '"' - assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_level(2) 
== "'" - assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_level(2) == "'" - assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_level(3) == '"' - assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_level(3) == '"' - assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_level(4) == "'" - assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_level(4) == "'" + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(1) == '"' + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(2) == "'" + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(2) == "'" + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(3) == '"' + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(3) == '"' + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(4) == "'" + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(4) == "'" western_european_quote_convention = QuoteConvention( "test-quote-convention", @@ -333,12 +333,12 @@ def test_normalize() -> None: normalized_western_european_quote_convention = western_european_quote_convention.normalize() assert normalized_western_european_quote_convention.name == "test-quote-convention_normalized" assert normalized_western_european_quote_convention.num_levels == 3 - assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_level(1) == '"' - assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_level(1) == '"' - assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_level(2) == '"' - 
assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_level(2) == '"' - assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_level(3) == "'" - assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_level(3) == "'" + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' + assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_depth(1) == '"' + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_depth(2) == '"' + assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_depth(2) == '"' + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_depth(3) == "'" + assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_depth(3) == "'" hybrid_british_typewriter_english_quote_convention = QuoteConvention( "hybrid-british-typewriter-english-quote-convention", @@ -357,12 +357,12 @@ def test_normalize() -> None: == "hybrid-british-typewriter-english-quote-convention_normalized" ) assert normalized_hybrid_british_typewriter_english_quote_convention.num_levels == 3 - assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_level(1) == '"' - assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_level(1) == '"' - assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_level(2) == "'" - assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_level(2) == "'" - assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_level(3) == '"' - assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_level(3) == '"' + assert 
normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_depth(1) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_depth(2) == "'" + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_depth(2) == "'" + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_depth(3) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_depth(3) == '"' def test_print_summary() -> None: diff --git a/tests/corpora/punctuation_analysis/test_text_segment.py b/tests/corpora/punctuation_analysis/test_text_segment.py index 4fa34058..bb8f529d 100644 --- a/tests/corpora/punctuation_analysis/test_text_segment.py +++ b/tests/corpora/punctuation_analysis/test_text_segment.py @@ -6,12 +6,12 @@ def test_builder_initialization() -> None: builder = TextSegment.Builder() assert builder._text_segment._text == "" - assert builder._text_segment._previous_segment is None - assert builder._text_segment._next_segment is None + assert builder._text_segment.previous_segment is None + assert builder._text_segment.next_segment is None assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert builder._text_segment._markers_in_preceding_context == set() - assert builder._text_segment._index_in_verse == 0 - assert builder._text_segment._num_segments_in_verse == 0 + assert builder._text_segment.index_in_verse == 0 + assert builder._text_segment.num_segments_in_verse == 0 assert builder._text_segment._usfm_token is None @@ -28,12 +28,12 @@ def test_builder_set_previous_segment() -> None: previous_segment = TextSegment.Builder().set_text("previous segment text").build() builder.set_previous_segment(previous_segment) - assert 
builder._text_segment._previous_segment == previous_segment - assert builder._text_segment._next_segment is None + assert builder._text_segment.previous_segment == previous_segment + assert builder._text_segment.next_segment is None assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert builder._text_segment._markers_in_preceding_context == set() - assert builder._text_segment._index_in_verse == 0 - assert builder._text_segment._num_segments_in_verse == 0 + assert builder._text_segment.index_in_verse == 0 + assert builder._text_segment.num_segments_in_verse == 0 def test_builder_add_preceding_marker() -> None: @@ -42,8 +42,8 @@ def test_builder_add_preceding_marker() -> None: assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.CHAPTER assert builder._text_segment._markers_in_preceding_context == {UsfmMarkerType.CHAPTER} - assert builder._text_segment._previous_segment is None - assert builder._text_segment._next_segment is None + assert builder._text_segment.previous_segment is None + assert builder._text_segment.next_segment is None builder.add_preceding_marker(UsfmMarkerType.VERSE) assert builder._text_segment._immediate_preceding_marker == UsfmMarkerType.VERSE @@ -51,8 +51,8 @@ def test_builder_add_preceding_marker() -> None: UsfmMarkerType.CHAPTER, UsfmMarkerType.VERSE, } - assert builder._text_segment._previous_segment is None - assert builder._text_segment._next_segment is None + assert builder._text_segment.previous_segment is None + assert builder._text_segment.next_segment is None def test_builder_set_usfm_token() -> None: @@ -63,58 +63,8 @@ def test_builder_set_usfm_token() -> None: assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT assert builder._text_segment._usfm_token.text == "USFM token text" assert builder._text_segment._text == "" - assert builder._text_segment._previous_segment is None - assert builder._text_segment._next_segment is None - - -def 
test_set_previous_segment() -> None: - text_segment = TextSegment.Builder().set_text("example text").build() - previous_segment = TextSegment.Builder().set_text("previous segment text").build() - text_segment.previous_segment = previous_segment - - assert text_segment._previous_segment == previous_segment - assert text_segment._next_segment is None - assert text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER - assert text_segment._markers_in_preceding_context == set() - assert text_segment._index_in_verse == 0 - assert text_segment._num_segments_in_verse == 0 - - -def test_set_next_segment() -> None: - text_segment = TextSegment.Builder().set_text("example text").build() - next_segment = TextSegment.Builder().set_text("next segment text").build() - text_segment.next_segment = next_segment - - assert text_segment._previous_segment is None - assert text_segment._next_segment == next_segment - assert text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER - assert text_segment._markers_in_preceding_context == set() - assert text_segment._index_in_verse == 0 - assert text_segment._num_segments_in_verse == 0 - - -def test_set_index_in_verse() -> None: - text_segment = TextSegment.Builder().set_text("example text").build() - text_segment.set_index_in_verse(2) - - assert text_segment._index_in_verse == 2 - assert text_segment._previous_segment is None - assert text_segment._next_segment is None - assert text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER - assert text_segment._markers_in_preceding_context == set() - assert text_segment._num_segments_in_verse == 0 - - -def test_set_num_segments_in_verse() -> None: - text_segment = TextSegment.Builder().set_text("example text").build() - text_segment.set_num_segments_in_verse(5) - - assert text_segment._num_segments_in_verse == 5 - assert text_segment._previous_segment is None - assert text_segment._next_segment is None - assert text_segment._immediate_preceding_marker is 
UsfmMarkerType.NO_MARKER - assert text_segment._markers_in_preceding_context == set() - assert text_segment._index_in_verse == 0 + assert builder._text_segment.previous_segment is None + assert builder._text_segment.next_segment is None def test_equals() -> None: @@ -128,11 +78,11 @@ def test_equals() -> None: assert basic_segment != different_text_segment segment_with_index = TextSegment.Builder().set_text("text1").build() - segment_with_index.set_index_in_verse(1) + segment_with_index.index_in_verse = 1 segment_with_same_index = TextSegment.Builder().set_text("text1").build() - segment_with_same_index.set_index_in_verse(1) + segment_with_same_index.index_in_verse = 1 segment_with_different_index = TextSegment.Builder().set_text("text1").build() - segment_with_different_index.set_index_in_verse(2) + segment_with_different_index.index_in_verse = 2 assert segment_with_index == segment_with_same_index assert segment_with_index != segment_with_different_index @@ -171,11 +121,11 @@ def test_equals() -> None: # attributes that are not used in equality checks segment_with_num_verses = TextSegment.Builder().set_text("text1").build() - segment_with_num_verses.set_num_segments_in_verse(3) + segment_with_num_verses.num_segments_in_verse = 3 segment_with_same_num_verses = TextSegment.Builder().set_text("text1").build() - segment_with_same_num_verses.set_num_segments_in_verse(3) + segment_with_same_num_verses.num_segments_in_verse = 3 segment_with_different_num_verses = TextSegment.Builder().set_text("text1").build() - segment_with_different_num_verses.set_num_segments_in_verse(4) + segment_with_different_num_verses.num_segments_in_verse = 4 assert segment_with_num_verses == segment_with_same_num_verses assert segment_with_num_verses != segment_with_different_num_verses @@ -269,24 +219,24 @@ def test_is_marker_in_preceding_context() -> None: def test_is_first_segment_in_verse() -> None: text_segment = TextSegment.Builder().set_text("example text").build() - 
text_segment.set_index_in_verse(0) + text_segment.index_in_verse = 0 assert text_segment.is_first_segment_in_verse() is True - text_segment.set_index_in_verse(1) + text_segment.index_in_verse = 1 assert text_segment.is_first_segment_in_verse() is False def test_is_last_segment_in_verse() -> None: text_segment = TextSegment.Builder().set_text("example text").build() - text_segment.set_index_in_verse(0) - text_segment.set_num_segments_in_verse(1) + text_segment.index_in_verse = 0 + text_segment.num_segments_in_verse = 1 assert text_segment.is_last_segment_in_verse() is True - text_segment.set_index_in_verse(0) - text_segment.set_num_segments_in_verse(2) + text_segment.index_in_verse = 0 + text_segment.num_segments_in_verse = 2 assert text_segment.is_last_segment_in_verse() is False - text_segment.set_index_in_verse(1) + text_segment.index_in_verse = 1 assert text_segment.is_last_segment_in_verse() is True diff --git a/tests/corpora/punctuation_analysis/test_verse.py b/tests/corpora/punctuation_analysis/test_verse.py index ddfa58d4..6212e2b3 100644 --- a/tests/corpora/punctuation_analysis/test_verse.py +++ b/tests/corpora/punctuation_analysis/test_verse.py @@ -23,9 +23,9 @@ def test_segment_indices() -> None: verse = Verse(text_segments) - assert verse.text_segments[0]._index_in_verse == 0 - assert verse.text_segments[1]._index_in_verse == 1 - assert verse.text_segments[2]._index_in_verse == 2 + assert verse.text_segments[0].index_in_verse == 0 + assert verse.text_segments[1].index_in_verse == 1 + assert verse.text_segments[2].index_in_verse == 2 def test_num_segments_in_verse() -> None: @@ -37,6 +37,6 @@ def test_num_segments_in_verse() -> None: verse = Verse(text_segments) - assert verse.text_segments[0]._num_segments_in_verse == 3 - assert verse.text_segments[1]._num_segments_in_verse == 3 - assert verse.text_segments[2]._num_segments_in_verse == 3 + assert verse.text_segments[0].num_segments_in_verse == 3 + assert verse.text_segments[1].num_segments_in_verse == 3 
+ assert verse.text_segments[2].num_segments_in_verse == 3 diff --git a/tests/corpora/test_fallback_quotation_mark_resolver.py b/tests/corpora/test_fallback_quotation_mark_resolver.py index e9ad7bdc..381d1976 100644 --- a/tests/corpora/test_fallback_quotation_mark_resolver.py +++ b/tests/corpora/test_fallback_quotation_mark_resolver.py @@ -1,5 +1,6 @@ from machine.corpora import FallbackQuotationMarkResolver, QuotationMarkUpdateResolutionSettings from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, QuotationMarkDirection, QuotationMarkMetadata, QuotationMarkResolutionIssue, @@ -7,18 +8,15 @@ QuoteConventionDetectionResolutionSettings, QuoteConventionSet, TextSegment, - standard_quote_conventions, ) def test_reset(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(english_quote_convention, english_quote_convention) + QuotationMarkUpdateResolutionSettings(english_quote_convention) ) basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( @@ -32,13 +30,11 @@ def test_reset(): def test_simple_quotation_mark_resolution(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) ) actual_resolved_quotation_marks = list( @@ 
-65,13 +61,11 @@ def test_simple_quotation_mark_resolution(): def test_is_opening_quote(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) ) # valid opening quote at start of segment @@ -108,9 +102,7 @@ def test_is_opening_quote(): def test_is_opening_quote_with_unambiguous_quote_convention(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( @@ -135,13 +127,11 @@ def test_is_opening_quote_with_unambiguous_quote_convention(): def test_is_opening_quote_stateful(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) ) # no preceding quote @@ -156,13 +146,11 @@ def test_is_opening_quote_stateful(): def test_does_most_recent_opening_mark_immediately_precede(): - english_quote_convention = 
standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(english_quote_convention, english_quote_convention) + QuotationMarkUpdateResolutionSettings(english_quote_convention) ) # no preceding quote @@ -196,13 +184,11 @@ def test_does_most_recent_opening_mark_immediately_precede(): def test_is_closing_quote(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) ) # valid closing quote at end of segment @@ -239,9 +225,7 @@ def test_is_closing_quote(): def test_is_closing_quote_with_unambiguous_quote_convention(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( @@ -266,13 +250,11 @@ def test_is_closing_quote_with_unambiguous_quote_convention(): def test_resolve_opening_quote(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = 
STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) ) expected_resolved_quotation_mark = QuotationMarkMetadata( @@ -286,13 +268,11 @@ def test_resolve_opening_quote(): def test_resolve_closing_quote(): - english_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - "standard_english" - ) + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None basic_quotation_mark_resolver = FallbackQuotationMarkResolver( - QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize(), english_quote_convention) + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) ) expected_resolved_quotation_mark = QuotationMarkMetadata( diff --git a/tests/corpora/test_quotation_denormalization.py b/tests/corpora/test_quotation_denormalization.py new file mode 100644 index 00000000..650f5ac8 --- /dev/null +++ b/tests/corpora/test_quotation_denormalization.py @@ -0,0 +1,53 @@ +from machine.corpora import ( + QuotationMarkDenormalizationFirstPass, + QuotationMarkDenormalizationUsfmUpdateBlockHandler, + QuotationMarkUpdateSettings, + UpdateUsfmParserHandler, + parse_usfm, +) +from machine.corpora.punctuation_analysis import STANDARD_QUOTE_CONVENTIONS + + +def test_full_quotation_denormalization_pipeline() -> None: + normalized_usfm = """ + \\id GEN + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'?" 
+ \\v 2 The woman said to the serpent, + "We may eat fruit from the trees of the garden, + \\v 3 but not the fruit of the tree which is in the middle of the garden. + God has said, 'You shall not eat of it. You shall not touch it, lest you die.'" + """ + + expected_denormalized_usfm = """\\id GEN\r +\\c 1\r +\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”\r +\\v 2 The woman said to the serpent, “We may eat fruit from the trees of the garden,\r +\\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’”\r +""" # noqa: E501 + + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + + quotation_mark_denormalization_first_pass = QuotationMarkDenormalizationFirstPass( + standard_english_quote_convention, standard_english_quote_convention + ) + + parse_usfm(normalized_usfm, quotation_mark_denormalization_first_pass) + best_chapter_strategies = quotation_mark_denormalization_first_pass.find_best_chapter_strategies() + + quotation_mark_denormalizer = QuotationMarkDenormalizationUsfmUpdateBlockHandler( + standard_english_quote_convention, + standard_english_quote_convention, + QuotationMarkUpdateSettings(chapter_strategies=best_chapter_strategies), + ) + + updater = UpdateUsfmParserHandler(update_block_handlers=[quotation_mark_denormalizer]) + parse_usfm(normalized_usfm, updater) + + actual_denormalized_usfm = updater.get_usfm() + + assert actual_denormalized_usfm == expected_denormalized_usfm diff --git a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py index 36f73133..e6a20b1d 100644 --- 
a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py +++ b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py @@ -7,7 +7,7 @@ UpdateUsfmParserHandler, parse_usfm, ) -from machine.corpora.punctuation_analysis import QuoteConvention, standard_quote_conventions +from machine.corpora.punctuation_analysis import STANDARD_QUOTE_CONVENTIONS, QuoteConvention simple_normalized_usfm = """\\c 1 \\v 1 Now the serpent was more subtle than any animal @@ -302,7 +302,7 @@ def test_fallback_quotation_denormalization_same_as_full() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -324,7 +324,7 @@ def test_fallback_quotation_denormalization_incorrectly_nested() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -346,7 +346,7 @@ def test_fallback_quotation_denormalization_incorrectly_nested_second_case() -> normalized_usfm, "standard_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -368,7 +368,7 @@ def test_fallback_quotation_denormalization_unclosed_quote() -> None: normalized_usfm, "standard_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + 
QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -412,8 +412,6 @@ def assert_usfm_equal(observed_usfm: str, expected_usfm: str) -> None: def get_quote_convention_by_name(name: str) -> QuoteConvention: - quote_convention: Union[QuoteConvention, None] = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) - ) + quote_convention: Union[QuoteConvention, None] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) assert quote_convention is not None return quote_convention diff --git a/tests/corpora/test_quotation_mark_update_first_pass.py b/tests/corpora/test_quotation_mark_update_first_pass.py index 2e80c2bd..95915602 100644 --- a/tests/corpora/test_quotation_mark_update_first_pass.py +++ b/tests/corpora/test_quotation_mark_update_first_pass.py @@ -2,12 +2,12 @@ from machine.corpora import QuotationMarkUpdateFirstPass, QuotationMarkUpdateStrategy, parse_usfm from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, Chapter, QuotationMarkResolutionIssue, QuoteConvention, TextSegment, Verse, - standard_quote_conventions, ) @@ -631,14 +631,10 @@ def test_ambiguous_in_first_unpaired_in_second() -> None: def run_first_pass( normalized_usfm: str, source_quote_convention_name: str, target_quote_convention_name: str ) -> List[QuotationMarkUpdateStrategy]: - source_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - source_quote_convention_name - ) + source_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(source_quote_convention_name) assert source_quote_convention is not None - target_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - target_quote_convention_name - ) + target_quote_convention = 
STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(target_quote_convention_name) assert target_quote_convention is not None first_pass_analyzer = QuotationMarkUpdateFirstPass(source_quote_convention, target_quote_convention) @@ -650,14 +646,10 @@ def run_first_pass( def run_first_pass_on_chapter( verse_texts: List[str], source_quote_convention_name: str, target_quote_convention_name: str ) -> QuotationMarkUpdateStrategy: - source_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - source_quote_convention_name - ) + source_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(source_quote_convention_name) assert source_quote_convention is not None - target_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - target_quote_convention_name - ) + target_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(target_quote_convention_name) assert target_quote_convention is not None first_pass_analyzer = QuotationMarkUpdateFirstPass(source_quote_convention, target_quote_convention) @@ -668,8 +660,6 @@ def run_first_pass_on_chapter( def get_quote_convention_by_name(name: str) -> QuoteConvention: - quote_convention: Union[QuoteConvention, None] = ( - standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) - ) + quote_convention: Union[QuoteConvention, None] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) assert quote_convention is not None return quote_convention diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index 8b60fb80..4944aea4 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -14,6 +14,7 @@ parse_usfm, ) from 
machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, QuotationMarkDirection, QuotationMarkFinder, QuotationMarkMetadata, @@ -23,7 +24,6 @@ QuoteConventionSet, TextSegment, UsfmMarkerType, - standard_quote_conventions, ) @@ -142,7 +142,7 @@ def test_fallback_strategy_same_as_full() -> None: normalized_usfm, "british_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -164,7 +164,7 @@ def test_fallback_strategy_incorrectly_nested() -> None: normalized_usfm, "british_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -186,7 +186,7 @@ def test_fallback_strategy_incorrectly_nested_second_case() -> None: normalized_usfm, "british_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -208,7 +208,7 @@ def test_fallback_strategy_unclosed_quote() -> None: normalized_usfm, "british_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_usfm) @@ -249,7 +249,7 @@ def test_default_quotation_mark_update_strategy() -> None: normalized_usfm, "typewriter_english", "standard_english", - 
QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FULL), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FULL), ) assert_usfm_equal(observed_usfm, expected_full_usfm) @@ -257,7 +257,7 @@ def test_default_quotation_mark_update_strategy() -> None: normalized_usfm, "typewriter_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), ) assert_usfm_equal(observed_usfm, expected_basic_usfm) @@ -265,7 +265,7 @@ def test_default_quotation_mark_update_strategy() -> None: normalized_usfm, "typewriter_english", "standard_english", - QuotationMarkUpdateSettings(default_chapter_action=QuotationMarkUpdateStrategy.SKIP), + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.SKIP), ) assert_usfm_equal(observed_usfm, expected_skipped_usfm) @@ -299,7 +299,7 @@ def test_single_chapter_quotation_mark_update_strategy() -> None: normalized_usfm, "typewriter_english", "standard_english", - QuotationMarkUpdateSettings(chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FULL]), + QuotationMarkUpdateSettings(chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FULL]), ) assert_usfm_equal(observed_usfm, expected_full_usfm) @@ -307,7 +307,7 @@ def test_single_chapter_quotation_mark_update_strategy() -> None: normalized_usfm, "typewriter_english", "standard_english", - QuotationMarkUpdateSettings(chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FALLBACK]), + QuotationMarkUpdateSettings(chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FALLBACK]), ) assert_usfm_equal(observed_usfm, expected_basic_usfm) @@ -315,7 +315,7 @@ def test_single_chapter_quotation_mark_update_strategy() -> None: normalized_usfm, "typewriter_english", "standard_english", - 
QuotationMarkUpdateSettings(chapter_actions=[QuotationMarkUpdateStrategy.SKIP]), + QuotationMarkUpdateSettings(chapter_strategies=[QuotationMarkUpdateStrategy.SKIP]), ) assert_usfm_equal(observed_usfm, expected_skipped_usfm) @@ -347,7 +347,7 @@ def test_multiple_chapter_same_strategy() -> None: "typewriter_english", "standard_english", QuotationMarkUpdateSettings( - chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FULL] + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FULL] ), ) assert_usfm_equal(observed_usfm, expected_full_usfm) @@ -357,7 +357,7 @@ def test_multiple_chapter_same_strategy() -> None: "typewriter_english", "standard_english", QuotationMarkUpdateSettings( - chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FALLBACK] ), ) assert_usfm_equal(observed_usfm, expected_fallback_usfm) @@ -397,7 +397,7 @@ def test_multiple_chapter_multiple_strategies() -> None: "typewriter_english", "standard_english", QuotationMarkUpdateSettings( - chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FALLBACK] ), ) assert_usfm_equal(observed_usfm, expected_full_then_fallback_usfm) @@ -407,7 +407,7 @@ def test_multiple_chapter_multiple_strategies() -> None: "typewriter_english", "standard_english", QuotationMarkUpdateSettings( - chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FULL] + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FULL] ), ) assert_usfm_equal(observed_usfm, expected_fallback_then_full_usfm) @@ -417,7 +417,7 @@ def test_multiple_chapter_multiple_strategies() -> None: 
"typewriter_english", "standard_english", QuotationMarkUpdateSettings( - chapter_actions=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.SKIP] + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.SKIP] ), ) assert_usfm_equal(observed_usfm, expected_fallback_then_skip_usfm) @@ -459,8 +459,8 @@ def test_create_text_segments_basic() -> None: assert text_segments[0]._text == "test segment" assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert text_segments[0]._markers_in_preceding_context == set() - assert text_segments[0]._previous_segment is None - assert text_segments[0]._next_segment is None + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment is None def test_create_text_segments_with_preceding_markers() -> None: @@ -485,8 +485,8 @@ def test_create_text_segments_with_preceding_markers() -> None: UsfmMarkerType.VERSE, UsfmMarkerType.PARAGRAPH, } - assert text_segments[0]._previous_segment is None - assert text_segments[0]._next_segment is None + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment is None def test_create_text_segments_with_multiple_text_tokens() -> None: @@ -515,16 +515,16 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: UsfmMarkerType.VERSE, UsfmMarkerType.PARAGRAPH, } - assert text_segments[0]._previous_segment is None - assert text_segments[0]._next_segment == text_segments[1] + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment == text_segments[1] assert text_segments[1]._text == "test segment2" assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER assert text_segments[1]._markers_in_preceding_context == { UsfmMarkerType.VERSE, UsfmMarkerType.CHARACTER, } - assert text_segments[1]._previous_segment == text_segments[0] - assert text_segments[1]._next_segment is None + assert 
text_segments[1].previous_segment == text_segments[0] + assert text_segments[1].next_segment is None def test_create_text_segment() -> None: @@ -555,12 +555,12 @@ def test_set_previous_and_next_for_segments() -> None: quote_convention_changer._set_previous_and_next_for_segments(segments) - assert segments[0]._previous_segment is None - assert segments[0]._next_segment == segments[1] - assert segments[1]._previous_segment == segments[0] - assert segments[1]._next_segment == segments[2] - assert segments[2]._previous_segment == segments[1] - assert segments[2]._next_segment is None + assert segments[0].previous_segment is None + assert segments[0].next_segment == segments[1] + assert segments[1].previous_segment == segments[0] + assert segments[1].next_segment == segments[2] + assert segments[2].previous_segment == segments[1] + assert segments[2].next_segment is None def test_check_for_chapter_change() -> None: @@ -585,7 +585,7 @@ def test_start_new_chapter() -> None: "standard_english", "standard_english", QuotationMarkUpdateSettings( - chapter_actions=[ + chapter_strategies=[ QuotationMarkUpdateStrategy.SKIP, QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FALLBACK, @@ -639,14 +639,10 @@ def create_quote_convention_changing_usfm_update_block_handler( target_quote_convention_name: str, quotation_mark_update_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), ) -> QuoteConventionChangingUsfmUpdateBlockHandler: - source_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - source_quote_convention_name - ) + source_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(source_quote_convention_name) assert source_quote_convention is not None - target_quote_convention = standard_quote_conventions.STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( - target_quote_convention_name - ) + target_quote_convention = 
STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(target_quote_convention_name) assert target_quote_convention is not None return QuoteConventionChangingUsfmUpdateBlockHandler( From fcc9e3b6c4794975a85a1b7708d39c2ba1792b4b Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 17 Jul 2025 11:14:27 -0400 Subject: [PATCH 25/31] Fix typing issue in tests for QuotationMarkUpdateFirstPass --- .../test_quotation_mark_update_first_pass.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/corpora/test_quotation_mark_update_first_pass.py b/tests/corpora/test_quotation_mark_update_first_pass.py index 95915602..c959cde0 100644 --- a/tests/corpora/test_quotation_mark_update_first_pass.py +++ b/tests/corpora/test_quotation_mark_update_first_pass.py @@ -334,25 +334,25 @@ def test_choose_best_action_based_on_observed_issues() -> None: first_pass_analyzer._will_fallback_mode_work = False # Test with no issues - best_action = first_pass_analyzer._choose_best_strategy_based_on_observed_issues([]) + best_action = first_pass_analyzer._choose_best_strategy_based_on_observed_issues(set()) assert best_action == QuotationMarkUpdateStrategy.APPLY_FULL # Test with one issue assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK] + {QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK} ) == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK] + {QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK} ) == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [QuotationMarkResolutionIssue.TOO_DEEP_NESTING] + {QuotationMarkResolutionIssue.TOO_DEEP_NESTING} ) == QuotationMarkUpdateStrategy.SKIP ) @@ -360,28 +360,28 @@ def test_choose_best_action_based_on_observed_issues() -> None: # Test with 
multiple issues assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [ + { QuotationMarkResolutionIssue.TOO_DEEP_NESTING, QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, - ] + } ) == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [ + { QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, - ] + } ) == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [ + { QuotationMarkResolutionIssue.TOO_DEEP_NESTING, QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, - ] + } ) == QuotationMarkUpdateStrategy.SKIP ) @@ -392,25 +392,25 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No first_pass_analyzer._will_fallback_mode_work = True # Test with no issues - best_action = first_pass_analyzer._choose_best_strategy_based_on_observed_issues([]) + best_action = first_pass_analyzer._choose_best_strategy_based_on_observed_issues(set()) assert best_action == QuotationMarkUpdateStrategy.APPLY_FULL # Test with one issue assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK] + {QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK} ) == QuotationMarkUpdateStrategy.APPLY_FALLBACK ) assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK] + {QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK} ) == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [QuotationMarkResolutionIssue.TOO_DEEP_NESTING] + {QuotationMarkResolutionIssue.TOO_DEEP_NESTING} ) == QuotationMarkUpdateStrategy.APPLY_FALLBACK ) @@ -418,28 +418,28 @@ def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> No # Test with multiple issues 
assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [ + { QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, - ] + } ) == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [ + { QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK, QuotationMarkResolutionIssue.TOO_DEEP_NESTING, - ] + } ) == QuotationMarkUpdateStrategy.SKIP ) assert ( first_pass_analyzer._choose_best_strategy_based_on_observed_issues( - [ + { QuotationMarkResolutionIssue.TOO_DEEP_NESTING, QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, - ] + } ) == QuotationMarkUpdateStrategy.APPLY_FALLBACK ) From 7392827d619b39c08d523f54ebe9c7ec1591b38c Mon Sep 17 00:00:00 2001 From: Ben King Date: Tue, 22 Jul 2025 11:23:45 -0400 Subject: [PATCH 26/31] Bug fix for multi-character quotation marks --- .../quotation_mark_metadata.py | 11 ++ ...tion_changing_usfm_update_block_handler.py | 24 ++- .../test_quotation_mark_metadata.py | 28 +++ ...tion_changing_usfm_block_update_handler.py | 160 ++++++++++++++++++ 4 files changed, 221 insertions(+), 2 deletions(-) diff --git a/machine/corpora/punctuation_analysis/quotation_mark_metadata.py b/machine/corpora/punctuation_analysis/quotation_mark_metadata.py index 7737bd41..efc9bc29 100644 --- a/machine/corpora/punctuation_analysis/quotation_mark_metadata.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_metadata.py @@ -15,6 +15,14 @@ class QuotationMarkMetadata: start_index: int end_index: int + @property + def length(self) -> int: + return self.end_index - self.start_index + + def shift_indices(self, shift_amount: int) -> None: + self.start_index += shift_amount + self.end_index += shift_amount + def update_quotation_mark(self, quote_convention: QuoteConvention) -> None: updated_quotation_mark = quote_convention.get_expected_quotation_mark(self.depth, self.direction) if updated_quotation_mark == 
self.quotation_mark: @@ -25,3 +33,6 @@ def update_quotation_mark(self, quote_convention: QuoteConvention) -> None: self.end_index, updated_quotation_mark, ) + + if len(updated_quotation_mark) != len(self.quotation_mark): + self.end_index += len(updated_quotation_mark) - len(self.quotation_mark) diff --git a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py index bdc648ee..818311dd 100644 --- a/machine/corpora/quote_convention_changing_usfm_update_block_handler.py +++ b/machine/corpora/quote_convention_changing_usfm_update_block_handler.py @@ -3,6 +3,7 @@ from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver from .punctuation_analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver from .punctuation_analysis.quotation_mark_finder import QuotationMarkFinder +from .punctuation_analysis.quotation_mark_metadata import QuotationMarkMetadata from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch from .punctuation_analysis.quote_convention import QuoteConvention @@ -85,8 +86,10 @@ def _process_scripture_element( quotation_mark_matches: List[QuotationMarkStringMatch] = ( self._quotation_mark_finder.find_all_potential_quotation_marks_in_text_segments(text_segments) ) - for resolved_quotation_mark in quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches): - resolved_quotation_mark.update_quotation_mark(self._target_quote_convention) + resolved_quotation_mark_matches: List[QuotationMarkMetadata] = list( + quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches) + ) + self._update_quotation_marks(resolved_quotation_mark_matches) def _create_text_segments(self, element: UsfmUpdateBlockElement) -> List[TextSegment]: text_segments: List[TextSegment] = [] @@ -122,6 +125,23 @@ def 
_set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) text_segments[i].next_segment = text_segments[i + 1] return text_segments + def _update_quotation_marks(self, resolved_quotation_mark_matches: List[QuotationMarkMetadata]) -> None: + for quotation_mark_index, resolved_quotation_mark_match in enumerate(resolved_quotation_mark_matches): + previous_length: int = resolved_quotation_mark_match.length + resolved_quotation_mark_match.update_quotation_mark(self._target_quote_convention) + updated_length: int = resolved_quotation_mark_match.length + + if previous_length != updated_length: + self._shift_quotation_mark_metadata_indices( + resolved_quotation_mark_matches[quotation_mark_index + 1 :], updated_length - previous_length + ) + + def _shift_quotation_mark_metadata_indices( + self, quotation_mark_metadata_list: List[QuotationMarkMetadata], shift_amount: int + ) -> None: + for quotation_mark_metadata in quotation_mark_metadata_list: + quotation_mark_metadata.shift_indices(shift_amount) + def _check_for_chapter_change(self, block: UsfmUpdateBlock) -> None: for scripture_ref in block.refs: if scripture_ref.chapter_num != self._current_chapter_number: diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py b/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py index e4e65d9b..2607ab9e 100644 --- a/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_metadata.py @@ -44,6 +44,34 @@ def test_update_quotation_mark() -> None: assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,' +def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: + quotation_mark_metadata = QuotationMarkMetadata( + quotation_mark='"', + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), + 
start_index=22, + end_index=23, + ) + quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french")) + assert quotation_mark_metadata.text_segment._text == "He said to the woman, < QuoteConvention: quote_convention: Union[QuoteConvention, None] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) assert quote_convention is not None diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index 4944aea4..f4b287de 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -423,6 +423,44 @@ def test_multiple_chapter_multiple_strategies() -> None: assert_usfm_equal(observed_usfm, expected_fallback_then_skip_usfm) +def test_multi_character_quotation_marks_in_source_quote_convention() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <?>> + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, " + + "‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks(input_usfm, "typewriter_french", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_multi_character_quotation_marks_in_target_quote_convention() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, <?>>" + ) + + observed_usfm = change_quotation_marks(input_usfm, "standard_english", "typewriter_french") + assert_usfm_equal(observed_usfm, expected_usfm) + + def test_process_scripture_element() -> None: quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( create_quote_convention_changing_usfm_update_block_handler("standard_english", "british_english") @@ -563,6 +601,128 @@ def test_set_previous_and_next_for_segments() -> None: assert segments[2].next_segment is None +def test_update_quotation_marks() -> None: + multi_char_to_single_char_quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("typewriter_french", "standard_english") + ) + + multi_character_text_segment: TextSegment = TextSegment.Builder().set_text("this < >>").build() + multi_character_quotation_marks: List[QuotationMarkMetadata] = [ + QuotationMarkMetadata( + quotation_mark="<<", + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=multi_character_text_segment, + start_index=5, + end_index=7, + ), + QuotationMarkMetadata( + quotation_mark="<", + depth=2, + direction=QuotationMarkDirection.OPENING, + text_segment=multi_character_text_segment, + start_index=10, + end_index=11, + ), + QuotationMarkMetadata( + quotation_mark=">", + depth=2, + direction=QuotationMarkDirection.CLOSING, + text_segment=multi_character_text_segment, + start_index=25, + end_index=26, + ), + QuotationMarkMetadata( + quotation_mark=">>", + depth=1, + direction=QuotationMarkDirection.CLOSING, + text_segment=multi_character_text_segment, + start_index=27, + end_index=29, + ), + ] + + multi_char_to_single_char_quote_convention_changer._update_quotation_marks(multi_character_quotation_marks) + + assert multi_character_text_segment.text == "this “is ‘a test segment’ ”" + + assert multi_character_quotation_marks[0].start_index == 5 + assert 
multi_character_quotation_marks[0].end_index == 6 + assert multi_character_quotation_marks[0].text_segment == multi_character_text_segment + + assert multi_character_quotation_marks[1].start_index == 9 + assert multi_character_quotation_marks[1].end_index == 10 + assert multi_character_quotation_marks[1].text_segment == multi_character_text_segment + + assert multi_character_quotation_marks[2].start_index == 24 + assert multi_character_quotation_marks[2].end_index == 25 + assert multi_character_quotation_marks[2].text_segment == multi_character_text_segment + + assert multi_character_quotation_marks[3].start_index == 26 + assert multi_character_quotation_marks[3].end_index == 27 + assert multi_character_quotation_marks[3].text_segment == multi_character_text_segment + + single_char_to_multi_char_quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "typewriter_french") + ) + + single_character_text_segment: TextSegment = TextSegment.Builder().set_text("this “is ‘a test segment’ ”").build() + single_character_quotation_marks: List[QuotationMarkMetadata] = [ + QuotationMarkMetadata( + quotation_mark="“", + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=single_character_text_segment, + start_index=5, + end_index=6, + ), + QuotationMarkMetadata( + quotation_mark="‘", + depth=2, + direction=QuotationMarkDirection.OPENING, + text_segment=single_character_text_segment, + start_index=9, + end_index=10, + ), + QuotationMarkMetadata( + quotation_mark="’", + depth=2, + direction=QuotationMarkDirection.CLOSING, + text_segment=single_character_text_segment, + start_index=24, + end_index=25, + ), + QuotationMarkMetadata( + quotation_mark="”", + depth=1, + direction=QuotationMarkDirection.CLOSING, + text_segment=single_character_text_segment, + start_index=26, + end_index=27, + ), + ] + + 
single_char_to_multi_char_quote_convention_changer._update_quotation_marks(single_character_quotation_marks) + + assert single_character_text_segment.text == "this < >>" + + assert single_character_quotation_marks[0].start_index == 5 + assert single_character_quotation_marks[0].end_index == 7 + assert single_character_quotation_marks[0].text_segment == single_character_text_segment + + assert single_character_quotation_marks[1].start_index == 10 + assert single_character_quotation_marks[1].end_index == 11 + assert single_character_quotation_marks[1].text_segment == single_character_text_segment + + assert single_character_quotation_marks[2].start_index == 25 + assert single_character_quotation_marks[2].end_index == 26 + assert single_character_quotation_marks[2].text_segment == single_character_text_segment + + assert single_character_quotation_marks[3].start_index == 27 + assert single_character_quotation_marks[3].end_index == 29 + assert single_character_quotation_marks[3].text_segment == single_character_text_segment + + def test_check_for_chapter_change() -> None: quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") From ee01a7f5ccdea4c0123667d1d03cd6d10e3be0f1 Mon Sep 17 00:00:00 2001 From: Ben King Date: Tue, 22 Jul 2025 12:45:51 -0400 Subject: [PATCH 27/31] Use is_at_start_of_segment for consistency --- .../corpora/punctuation_analysis/quotation_mark_string_match.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/corpora/punctuation_analysis/quotation_mark_string_match.py b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py index 67dd4a46..573e37c7 100644 --- a/machine/corpora/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py @@ -54,7 +54,7 @@ def previous_character_matches(self, regex_pattern: regex.Pattern) -> bool: 
@property def previous_character(self) -> Optional[str]: - if self._start_index == 0: + if self.is_at_start_of_segment(): previous_segment = self._text_segment.previous_segment if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( UsfmMarkerType.PARAGRAPH From 29be81ec59f82ffc3e9761b0095636568892cef3 Mon Sep 17 00:00:00 2001 From: Ben King Date: Tue, 22 Jul 2025 16:41:58 -0400 Subject: [PATCH 28/31] Include analysis details in QuoteConventionAnalysis --- .../punctuation_analysis/quote_convention_detector.py | 10 +++++----- .../test_quote_convention_detector.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/machine/corpora/punctuation_analysis/quote_convention_detector.py b/machine/corpora/punctuation_analysis/quote_convention_detector.py index eac56445..5a8d098d 100644 --- a/machine/corpora/punctuation_analysis/quote_convention_detector.py +++ b/machine/corpora/punctuation_analysis/quote_convention_detector.py @@ -19,6 +19,7 @@ class QuoteConventionAnalysis: best_quote_convention: QuoteConvention best_quote_convention_score: float + analysis_summary: str class QuoteConventionDetector(UsfmStructureExtractor): @@ -50,16 +51,15 @@ def _count_quotation_marks_in_chapter( self._quotation_mark_tabulator.tabulate(resolved_quotation_marks) - def detect_quote_convention(self, print_summary: bool) -> Optional[QuoteConventionAnalysis]: + def detect_quote_convention(self) -> Optional[QuoteConventionAnalysis]: self._count_quotation_marks_in_chapters(self.get_chapters()) (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention( self._quotation_mark_tabulator ) - if print_summary: - print(self._quotation_mark_tabulator.get_summary_message()) - if score > 0 and best_quote_convention is not None: - return QuoteConventionAnalysis(best_quote_convention, score) + return QuoteConventionAnalysis( + best_quote_convention, score, self._quotation_mark_tabulator.get_summary_message() + ) return None 
diff --git a/tests/corpora/punctuation_analysis/test_quote_convention_detector.py b/tests/corpora/punctuation_analysis/test_quote_convention_detector.py index 2e5b015a..190abb18 100644 --- a/tests/corpora/punctuation_analysis/test_quote_convention_detector.py +++ b/tests/corpora/punctuation_analysis/test_quote_convention_detector.py @@ -302,4 +302,4 @@ def test_mismatched_quotation_marks() -> None: def detect_quote_convention(usfm: str) -> Union[QuoteConventionAnalysis, None]: quote_convention_detector = QuoteConventionDetector() parse_usfm(usfm, quote_convention_detector) - return quote_convention_detector.detect_quote_convention(print_summary=False) + return quote_convention_detector.detect_quote_convention() From f93a19566877530ef6a335ec0112086ef01630fd Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 25 Jul 2025 17:23:53 -0400 Subject: [PATCH 29/31] Better guesses for ambiguous quotation marks + quote continuer edge case fix --- .../fallback_quotation_mark_resolver.py | 13 ++++ .../depth_based_quotation_mark_resolver.py | 3 + .../test_fallback_quotation_mark_resolver.py | 68 +++++++++++++++++-- 3 files changed, 79 insertions(+), 5 deletions(-) diff --git a/machine/corpora/fallback_quotation_mark_resolver.py b/machine/corpora/fallback_quotation_mark_resolver.py index 41b33a5e..0221cbbb 100644 --- a/machine/corpora/fallback_quotation_mark_resolver.py +++ b/machine/corpora/fallback_quotation_mark_resolver.py @@ -42,6 +42,19 @@ def _resolve_quotation_mark( else: self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) else: + # Make a reasonable guess about the direction of the quotation mark + if ( + self._last_quotation_mark is None + or self._last_quotation_mark.direction is QuotationMarkDirection.CLOSING + ): + quotation_mark: Optional[QuotationMarkMetadata] = self._resolve_opening_mark(quotation_mark_match) + if quotation_mark is not None: + yield quotation_mark + else: + quotation_mark: Optional[QuotationMarkMetadata] = 
self._resolve_closing_mark(quotation_mark_match) + if quotation_mark is not None: + yield quotation_mark + self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK) def _is_opening_quotation_mark( diff --git a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py index 1de92884..4643701d 100644 --- a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py +++ b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py @@ -176,6 +176,9 @@ def _meets_quote_continuer_prerequisites( self, quotation_mark_match: QuotationMarkStringMatch, ) -> bool: + if self._quote_continuer_state.current_depth >= self._quotation_mark_resolver_state.current_depth: + return False + if ( self._settings.should_rely_on_paragraph_markers and not quotation_mark_match._text_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH) diff --git a/tests/corpora/test_fallback_quotation_mark_resolver.py b/tests/corpora/test_fallback_quotation_mark_resolver.py index 381d1976..9a889c6c 100644 --- a/tests/corpora/test_fallback_quotation_mark_resolver.py +++ b/tests/corpora/test_fallback_quotation_mark_resolver.py @@ -29,7 +29,7 @@ def test_reset(): assert len(basic_quotation_mark_resolver._issues) == 0 -def test_simple_quotation_mark_resolution(): +def test_simple_quotation_mark_resolution_with_no_previous_mark(): english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") assert english_quote_convention is not None @@ -40,17 +40,75 @@ def test_simple_quotation_mark_resolution(): actual_resolved_quotation_marks = list( basic_quotation_mark_resolver.resolve_quotation_marks( [ - QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1), - QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 10, 11), + 
QuotationMarkStringMatch(TextSegment.Builder().set_text('test " text').build(), 5, 6), ] ) ) expected_resolved_quotation_marks = [ QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"test text"').build(), 0, 1 + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('test " text').build(), 5, 6 + ), + ] + + assert_resolved_quotation_marks_equal( + actual_resolved_quotation_marks, + expected_resolved_quotation_marks, + ) + + +def test_simple_quotation_mark_resolution_with_previous_opening_mark(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) + ) + + actual_resolved_quotation_marks = list( + basic_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test " text').build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test " text').build(), 6, 7), + ] + ) + ) + expected_resolved_quotation_marks = [ + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"test " text').build(), 0, 1 + ), + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().set_text('"test " text').build(), 6, 7 + ), + ] + + assert_resolved_quotation_marks_equal( + actual_resolved_quotation_marks, + expected_resolved_quotation_marks, + ) + + +def test_simple_quotation_mark_resolution_with_previous_closing_mark(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) + ) + + 
actual_resolved_quotation_marks = list( + basic_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('test" " text').build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text('test" " text').build(), 6, 7), + ] + ) + ) + expected_resolved_quotation_marks = [ + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().set_text('test" " text').build(), 4, 5 ), QuotationMarkMetadata( - '"', 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().set_text('"test text"').build(), 10, 11 + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('test" " text').build(), 6, 7 ), ] From c17c54faa6d5966a0d23e65312125789966ee248 Mon Sep 17 00:00:00 2001 From: Ben King Date: Wed, 30 Jul 2025 15:14:46 -0400 Subject: [PATCH 30/31] Code review changes requested for tests --- .../quotation_mark_update_first_pass.py | 14 +++-- ...est_depth_based_quotation_mark_resolver.py | 7 ++- .../test_quotation_mark_string_match.py | 2 +- .../test_quote_convention.py | 54 +++++++++---------- 4 files changed, 37 insertions(+), 40 deletions(-) diff --git a/machine/corpora/quotation_mark_update_first_pass.py b/machine/corpora/quotation_mark_update_first_pass.py index ee5827a5..e863c06c 100644 --- a/machine/corpora/quotation_mark_update_first_pass.py +++ b/machine/corpora/quotation_mark_update_first_pass.py @@ -1,3 +1,4 @@ +from collections import defaultdict from typing import Dict, List, Set from .punctuation_analysis.chapter import Chapter @@ -33,15 +34,12 @@ def __init__(self, source_quote_convention: QuoteConvention, target_quote_conven def _check_whether_fallback_mode_will_work( self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention ) -> bool: - target_marks_by_source_marks: Dict[str, Set[str]] = {} - for depth in range(1, source_quote_convention.num_levels + 1): + target_marks_by_source_marks: Dict[str, Set[str]] = defaultdict(set) + for 
depth in range(1, min(source_quote_convention.num_levels, target_quote_convention.num_levels) + 1): opening_quotation_mark = source_quote_convention.get_opening_quotation_mark_at_depth(depth) - if opening_quotation_mark not in target_marks_by_source_marks: - target_marks_by_source_marks[opening_quotation_mark] = set() - if depth <= target_quote_convention.num_levels: - target_marks_by_source_marks[opening_quotation_mark].add( - target_quote_convention.get_closing_quotation_mark_at_depth(depth) - ) + target_marks_by_source_marks[opening_quotation_mark].add( + target_quote_convention.get_closing_quotation_mark_at_depth(depth) + ) for source_mark in target_marks_by_source_marks: if len(target_marks_by_source_marks[source_mark]) > 1: diff --git a/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py b/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py index 3a69af3d..f9303ebc 100644 --- a/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py +++ b/tests/corpora/punctuation_analysis/test_depth_based_quotation_mark_resolver.py @@ -993,19 +993,19 @@ def test_is_opening_quote() -> None: QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( - QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 1, 2) + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) ) assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) ) assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( - QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 1, 2) + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) ) assert 
standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019").build(), 1, 2) ) assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( - QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 1, 2) + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) ) assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201c").build(), 1, 2) @@ -2290,7 +2290,6 @@ def test_too_deep_nesting_issue() -> None: QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 6, 7), QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.OPENING, text_segment, 10, 11), QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.OPENING, text_segment, 13, 14), - # QuotationMarkMetadata("\u201c", 5, QuotationMarkDirection.Opening, text_segment, 20, 21), ] assert standard_english_quotation_mark_resolver.get_issues() == { QuotationMarkResolutionIssue.TOO_DEEP_NESTING, diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py b/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py index c21568e9..82744ec8 100644 --- a/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py @@ -154,7 +154,7 @@ def test_does_trailing_substring_match() -> None: assert quotation_mark_string_match.trailing_substring_matches(regex.compile(r"^ text$")) quotation_mark_string_match = QuotationMarkStringMatch( - TextSegment.Builder().set_text("sample text").build(), 11, 12 + TextSegment.Builder().set_text("sample text").build(), 10, 11 ) assert not quotation_mark_string_match.trailing_substring_matches(regex.compile(r".+")) diff --git a/tests/corpora/punctuation_analysis/test_quote_convention.py 
b/tests/corpora/punctuation_analysis/test_quote_convention.py index 514ffdf1..12c5deb5 100644 --- a/tests/corpora/punctuation_analysis/test_quote_convention.py +++ b/tests/corpora/punctuation_analysis/test_quote_convention.py @@ -89,13 +89,13 @@ def test_get_num_levels() -> None: assert empty_quote_convention.num_levels == 0 one_level_quote_convention = QuoteConvention( - "one-level-quote-convention", + "one_level_quote_convention", [SingleLevelQuoteConvention("\u201c", "\u201d")], ) assert one_level_quote_convention.num_levels == 1 two_level_quote_convention = QuoteConvention( - "two-level-quote-convention", + "two_level_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -104,7 +104,7 @@ def test_get_num_levels() -> None: assert two_level_quote_convention.num_levels == 2 three_level_quote_convention = QuoteConvention( - "three-level-quote-convention", + "three_level_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -116,7 +116,7 @@ def test_get_num_levels() -> None: def test_get_opening_quote_at_level() -> None: quote_convention = QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -130,7 +130,7 @@ def test_get_opening_quote_at_level() -> None: def test_get_closing_quote_at_level() -> None: quote_convention = QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -144,7 +144,7 @@ def test_get_closing_quote_at_level() -> None: def test_get_expected_quotation_mark() -> None: quote_convention = QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -168,28 +168,28 @@ def 
test_includes_opening_quotation_mark() -> None: assert not empty_quote_convention._includes_opening_quotation_mark("\u201c") positive_quote_convention1 = QuoteConvention( - "positive quote convention 1", [SingleLevelQuoteConvention("\u201c", "\u201d")] + "positive_quote_convention_1", [SingleLevelQuoteConvention("\u201c", "\u201d")] ) assert positive_quote_convention1._includes_opening_quotation_mark("\u201c") negative_quote_convention1 = QuoteConvention( - "negative quote convention 1", [SingleLevelQuoteConvention("\u2018", "\u2019")] + "negative_quote_convention_1", [SingleLevelQuoteConvention("\u2018", "\u2019")] ) assert not negative_quote_convention1._includes_opening_quotation_mark("\u201c") negative_quote_convention2 = QuoteConvention( - "negative quote convention 2", [SingleLevelQuoteConvention("\u201d", "\u201c")] + "negative_quote_convention_2", [SingleLevelQuoteConvention("\u201d", "\u201c")] ) assert not negative_quote_convention2._includes_opening_quotation_mark("\u201c") positive_quote_convention2 = QuoteConvention( - "positive quote convention 2", + "positive_quote_convention_2", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")], ) assert positive_quote_convention2._includes_opening_quotation_mark("\u201c") positive_quote_convention3 = QuoteConvention( - "positive quote convention 3", + "positive_quote_convention_3", [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u201c", "\u201d")], ) assert positive_quote_convention3._includes_opening_quotation_mark("\u201c") @@ -210,34 +210,34 @@ def test_includes_closing_quotation_mark() -> None: assert not empty_quote_convention._includes_closing_quotation_mark("\u201d") positive_quote_convention1 = QuoteConvention( - "positive quote convention 1", [SingleLevelQuoteConvention("\u201c", "\u201d")] + "positive_quote_convention_1", [SingleLevelQuoteConvention("\u201c", "\u201d")] ) assert 
positive_quote_convention1._includes_closing_quotation_mark("\u201d") negative_quote_convention1 = QuoteConvention( - "negative quote convention 1", [SingleLevelQuoteConvention("\u2018", "\u2019")] + "negative_quote_convention_1", [SingleLevelQuoteConvention("\u2018", "\u2019")] ) assert not negative_quote_convention1._includes_closing_quotation_mark("\u201d") negative_quote_convention2 = QuoteConvention( - "negative quote convention 2", [SingleLevelQuoteConvention("\u201d", "\u201c")] + "negative_quote_convention_2", [SingleLevelQuoteConvention("\u201d", "\u201c")] ) assert not negative_quote_convention2._includes_closing_quotation_mark("\u201d") positive_quote_convention2 = QuoteConvention( - "positive quote convention 2", + "positive_quote_convention_2", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")], ) assert positive_quote_convention2._includes_closing_quotation_mark("\u201d") positive_quote_convention3 = QuoteConvention( - "positive quote convention 3", + "positive_quote_convention_3", [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u201c", "\u201d")], ) assert positive_quote_convention3._includes_closing_quotation_mark("\u201d") negative_quote_convention3 = QuoteConvention( - "negative quote convention 3", + "negative_quote_convention_3", [ SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("'", "'"), @@ -249,7 +249,7 @@ def test_includes_closing_quotation_mark() -> None: def test_get_possible_depths() -> None: quote_convention = QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -271,7 +271,7 @@ def test_get_possible_depths() -> None: def test_is_compatible_with_observed_quotation_marks() -> None: quote_convention = QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), 
SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -302,7 +302,7 @@ def test_normalize() -> None: assert normalized_empty_quote_convention.num_levels == 0 standard_english_quote_convention = QuoteConvention( - "standard-english-quote-convention", + "standard_english_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -311,7 +311,7 @@ def test_normalize() -> None: ], ) normalized_standard_english_quote_convention = standard_english_quote_convention.normalize() - assert normalized_standard_english_quote_convention.name == "standard-english-quote-convention_normalized" + assert normalized_standard_english_quote_convention.name == "standard_english_quote_convention_normalized" assert normalized_standard_english_quote_convention.num_levels == 4 assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(1) == '"' @@ -323,7 +323,7 @@ def test_normalize() -> None: assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(4) == "'" western_european_quote_convention = QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u00ab", "\u00bb"), @@ -331,7 +331,7 @@ def test_normalize() -> None: ], ) normalized_western_european_quote_convention = western_european_quote_convention.normalize() - assert normalized_western_european_quote_convention.name == "test-quote-convention_normalized" + assert normalized_western_european_quote_convention.name == "test_quote_convention_normalized" assert normalized_western_european_quote_convention.num_levels == 3 assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_depth(1) == '"' @@ -341,7 +341,7 @@ def 
test_normalize() -> None: assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_depth(3) == "'" hybrid_british_typewriter_english_quote_convention = QuoteConvention( - "hybrid-british-typewriter-english-quote-convention", + "hybrid_british_typewriter_english_quote_convention", [ SingleLevelQuoteConvention("\u00ab", "\u00bb"), SingleLevelQuoteConvention("'", "'"), @@ -354,7 +354,7 @@ def test_normalize() -> None: ) assert ( normalized_hybrid_british_typewriter_english_quote_convention.name - == "hybrid-british-typewriter-english-quote-convention_normalized" + == "hybrid_british_typewriter_english_quote_convention_normalized" ) assert normalized_hybrid_british_typewriter_english_quote_convention.num_levels == 3 assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' @@ -367,7 +367,7 @@ def test_normalize() -> None: def test_print_summary() -> None: quote_convention = QuoteConvention( - "test-quote-convention", + "test_quote_convention", [ SingleLevelQuoteConvention("\u201c", "\u201D"), SingleLevelQuoteConvention("\u2018", "\u2019"), @@ -375,7 +375,7 @@ def test_print_summary() -> None: ], ) expected_summary_message = ( - "test-quote-convention\n" + "test_quote_convention\n" + "\u201CFirst-level quote\u201D\n" + "\u2018Second-level quote\u2019\n" + "\u201DThird-level quote\u201D\n" From f291d4d648e20cb5e49ae615ec65e8c4a16f586a Mon Sep 17 00:00:00 2001 From: Ben King Date: Thu, 31 Jul 2025 15:21:43 -0400 Subject: [PATCH 31/31] Fix fallback denormalization logic + line endings in a test --- .../quotation_mark_update_first_pass.py | 28 ++++++++++----- .../corpora/test_quotation_denormalization.py | 14 ++++---- .../test_quotation_mark_update_first_pass.py | 36 +++++++++++++++++++ 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/machine/corpora/quotation_mark_update_first_pass.py b/machine/corpora/quotation_mark_update_first_pass.py index e863c06c..b42af15a 100644 --- 
a/machine/corpora/quotation_mark_update_first_pass.py +++ b/machine/corpora/quotation_mark_update_first_pass.py @@ -1,4 +1,3 @@ -from collections import defaultdict from typing import Dict, List, Set from .punctuation_analysis.chapter import Chapter @@ -34,16 +33,27 @@ def __init__(self, source_quote_convention: QuoteConvention, target_quote_conven def _check_whether_fallback_mode_will_work( self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention ) -> bool: - target_marks_by_source_marks: Dict[str, Set[str]] = defaultdict(set) + opening_target_marks_by_source_marks: Dict[str, str] = {} + closing_target_marks_by_source_marks: Dict[str, str] = {} for depth in range(1, min(source_quote_convention.num_levels, target_quote_convention.num_levels) + 1): - opening_quotation_mark = source_quote_convention.get_opening_quotation_mark_at_depth(depth) - target_marks_by_source_marks[opening_quotation_mark].add( - target_quote_convention.get_closing_quotation_mark_at_depth(depth) - ) - - for source_mark in target_marks_by_source_marks: - if len(target_marks_by_source_marks[source_mark]) > 1: + source_opening_quotation_mark = source_quote_convention.get_opening_quotation_mark_at_depth(depth) + target_opening_quotation_mark = target_quote_convention.get_opening_quotation_mark_at_depth(depth) + if ( + source_opening_quotation_mark in opening_target_marks_by_source_marks + and opening_target_marks_by_source_marks[source_opening_quotation_mark] != target_opening_quotation_mark + ): + return False + opening_target_marks_by_source_marks[source_opening_quotation_mark] = target_opening_quotation_mark + + source_closing_quotation_mark = source_quote_convention.get_closing_quotation_mark_at_depth(depth) + target_closing_quotation_mark = target_quote_convention.get_closing_quotation_mark_at_depth(depth) + if ( + source_closing_quotation_mark in closing_target_marks_by_source_marks + and closing_target_marks_by_source_marks[source_closing_quotation_mark] != 
target_closing_quotation_mark + ): return False + closing_target_marks_by_source_marks[source_closing_quotation_mark] = target_closing_quotation_mark + return True def find_best_chapter_strategies(self) -> List[QuotationMarkUpdateStrategy]: diff --git a/tests/corpora/test_quotation_denormalization.py b/tests/corpora/test_quotation_denormalization.py index 650f5ac8..06327f5b 100644 --- a/tests/corpora/test_quotation_denormalization.py +++ b/tests/corpora/test_quotation_denormalization.py @@ -1,3 +1,5 @@ +from testutils.corpora_test_helpers import ignore_line_endings + from machine.corpora import ( QuotationMarkDenormalizationFirstPass, QuotationMarkDenormalizationUsfmUpdateBlockHandler, @@ -22,11 +24,11 @@ def test_full_quotation_denormalization_pipeline() -> None: God has said, 'You shall not eat of it. You shall not touch it, lest you die.'" """ - expected_denormalized_usfm = """\\id GEN\r -\\c 1\r -\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”\r -\\v 2 The woman said to the serpent, “We may eat fruit from the trees of the garden,\r -\\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’”\r + expected_denormalized_usfm = """\\id GEN +\\c 1 +\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” +\\v 2 The woman said to the serpent, “We may eat fruit from the trees of the garden, +\\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. 
You shall not touch it, lest you die.’” """ # noqa: E501 standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") @@ -50,4 +52,4 @@ def test_full_quotation_denormalization_pipeline() -> None: actual_denormalized_usfm = updater.get_usfm() - assert actual_denormalized_usfm == expected_denormalized_usfm + ignore_line_endings(actual_denormalized_usfm, expected_denormalized_usfm) diff --git a/tests/corpora/test_quotation_mark_update_first_pass.py b/tests/corpora/test_quotation_mark_update_first_pass.py index c959cde0..5b7ab9b0 100644 --- a/tests/corpora/test_quotation_mark_update_first_pass.py +++ b/tests/corpora/test_quotation_mark_update_first_pass.py @@ -6,6 +6,7 @@ Chapter, QuotationMarkResolutionIssue, QuoteConvention, + SingleLevelQuoteConvention, TextSegment, Verse, ) @@ -229,6 +230,41 @@ def test_check_whether_fallback_mode_will_work_with_normalized_conventions() -> ) +def test_check_whether_fallback_mode_will_work_with_artificial_conventions() -> None: + + first_pass_analyzer = QuotationMarkUpdateFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + + # This tests combinations of quotation marks that haven't been observed in real-world conventions, + # but would cause fallback mode not to work. 
+ assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + QuoteConvention( + "artificial_source_quote_convention1", + [SingleLevelQuoteConvention('"', '"'), SingleLevelQuoteConvention('"', '"')], + ), + QuoteConvention( + "artificial_target_quote_convention1", + [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u201c", "\u201c")], + ), + ) + is False + ) + + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + QuoteConvention( + "artificial_source_quote_convention2", + [SingleLevelQuoteConvention('"', '"'), SingleLevelQuoteConvention('"', '"')], + ), + QuoteConvention( + "artificial_target_quote_convention2", + [SingleLevelQuoteConvention("\u201d", "\u201d"), SingleLevelQuoteConvention("\u201c", "\u201d")], + ), + ) + is False + ) + + def test_choose_best_action_for_chapter() -> None: # Verse text with no issues actual_action = run_first_pass_on_chapter(