From e9868b8357874cf8e9073f44aed5fcd56d9e6237 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 29 Sep 2025 14:54:23 -0400 Subject: [PATCH 1/2] Port https://github.com/sillsdev/machine.py/pull/234 and https://github.com/sillsdev/machine.py/pull/235 --- .../PlaceMarkersUsfmUpdateBlockHandler.cs | 4 - .../QuotationMarkDenormalizationFirstPass.cs | 14 -- .../FallbackQuotationMarkResolver.cs | 6 +- .../ParatextProjectQuoteConventionDetector.cs | 7 +- .../PreliminaryQuotationMarkAnalyzer.cs | 46 ++++++ .../QuotationMarkDenormalizationFirstPass.cs | 9 ++ ...rkDenormalizationUsfmUpdateBlockHandler.cs | 7 +- .../QuotationMarkUpdateFirstPass.cs | 40 ++--- .../QuotationMarkUpdateResolutionSettings.cs | 15 +- .../QuotationMarkUpdateSettings.cs | 4 +- .../QuotationMarkUpdateStrategy.cs | 2 +- ...onventionChangingUsfmUpdateBlockHandler.cs | 24 ++- .../Corpora/QuotationDenormalizationTests.cs | 2 - ...ormalizationUsfmBlockUpdateHandlerTests.cs | 139 ++++++++++-------- 14 files changed, 164 insertions(+), 155 deletions(-) delete mode 100644 src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/FallbackQuotationMarkResolver.cs (98%) rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/ParatextProjectQuoteConventionDetector.cs (93%) create mode 100644 src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationFirstPass.cs rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs (76%) rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/QuotationMarkUpdateFirstPass.cs (67%) rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/QuotationMarkUpdateResolutionSettings.cs (77%) rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/QuotationMarkUpdateSettings.cs (93%) rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/QuotationMarkUpdateStrategy.cs (73%) rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/QuoteConventionChangingUsfmUpdateBlockHandler.cs (94%) diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs index d93b8fe0..4dd5ae40 100644 --- a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs @@ -1,7 +1,3 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using SIL.Extensions; using SIL.Machine.Translation; namespace SIL.Machine.Corpora diff --git a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs b/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs deleted file mode 100644 index c90827d5..00000000 --- a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs +++ /dev/null @@ -1,14 +0,0 @@ -using SIL.Machine.PunctuationAnalysis; - -namespace SIL.Machine.Corpora -{ - // This is a convenience class so that users don't have to know to normalize the source quote convention - public class QuotationMarkDenormalizationFirstPass : QuotationMarkUpdateFirstPass - { - public QuotationMarkDenormalizationFirstPass( - QuoteConvention sourceQuoteConvention, - QuoteConvention targetQuoteConvention - ) - : base(sourceQuoteConvention.Normalize(), targetQuoteConvention) { } - } -} diff --git a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/PunctuationAnalysis/FallbackQuotationMarkResolver.cs similarity index 98% rename from src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs rename to src/SIL.Machine/PunctuationAnalysis/FallbackQuotationMarkResolver.cs index 7e4a1af6..e0e0591f 100644 --- a/src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs +++ b/src/SIL.Machine/PunctuationAnalysis/FallbackQuotationMarkResolver.cs @@ -1,8 +1,4 @@ -using System.Collections.Generic; -using System.Linq; -using SIL.Machine.PunctuationAnalysis; - -namespace SIL.Machine.Corpora +namespace SIL.Machine.PunctuationAnalysis { public class FallbackQuotationMarkResolver : IQuotationMarkResolver { diff --git a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs similarity index 93% rename from src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs rename to src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs index db2c6a92..a32b6323 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs @@ -1,9 +1,4 @@ -using System; -using System.IO; -using System.Text; -using SIL.Machine.PunctuationAnalysis; - -namespace SIL.Machine.Corpora +namespace SIL.Machine.PunctuationAnalysis { public abstract class ParatextProjectQuoteConventionDetector { diff --git a/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs b/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs index 867119a0..203b447e 100644 --- a/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs +++ b/src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs @@ -6,6 +6,42 @@ namespace SIL.Machine.PunctuationAnalysis { + public class QuotationMarkCounter + { + private const double NegligibleProportionThreshold = 0.01; + private Dictionary _quotationMarkCounts; + private int _totalQuotationMarkCount; + + public QuotationMarkCounter() + { + Reset(); + } + + public void Reset() + { + _quotationMarkCounts = new Dictionary(); + _totalQuotationMarkCount = 0; + } + + public void CountQuotationMarks(List quotationMarks) + { + foreach (var quotationMarkMatch in quotationMarks) + { + string mark = quotationMarkMatch.QuotationMark; + _quotationMarkCounts.UpdateValue(mark, () => 0, i => i + 1); + _totalQuotationMarkCount++; + } + } + + public bool IsQuotationMarkProportionNegligible(string quotationMark) + { + if (_totalQuotationMarkCount == 0) + return true; + int quotationMarkCount = _quotationMarkCounts.TryGetValue(quotationMark, out int count) ? count : 0; + return ((double)quotationMarkCount / _totalQuotationMarkCount) < NegligibleProportionThreshold; + } + } + public class ApostropheProportionStatistics { private int _numCharacters; @@ -385,12 +421,14 @@ public class PreliminaryQuotationMarkAnalyzer private readonly QuoteConventionSet _quoteConventions; private readonly PreliminaryApostropheAnalyzer _apostropheAnalyzer; private readonly QuotationMarkSequences _quotationMarkSequences; + private readonly QuotationMarkCounter _quotationMarkCounts; public PreliminaryQuotationMarkAnalyzer(QuoteConventionSet quoteConventions) { _quoteConventions = quoteConventions; _apostropheAnalyzer = new PreliminaryApostropheAnalyzer(); _quotationMarkSequences = new QuotationMarkSequences(); + _quotationMarkCounts = new QuotationMarkCounter(); Reset(); } @@ -398,6 +436,7 @@ public void Reset() { _apostropheAnalyzer.Reset(); _quotationMarkSequences.Reset(); + _quotationMarkCounts.Reset(); } public QuoteConventionSet NarrowDownPossibleQuoteConventions(List chapters) @@ -420,6 +459,7 @@ private void AnalyzeQuotationMarksForVerse(Verse verse) ).FindAllPotentialQuotationMarksInVerse(verse); AnalyzeQuotationMarkSequence(quotationMarks); _apostropheAnalyzer.ProcessQuotationMarks(verse.TextSegments.ToList(), quotationMarks); + _quotationMarkCounts.CountQuotationMarks(quotationMarks); } private void AnalyzeQuotationMarkSequence(List quotationMarks) @@ -450,6 +490,9 @@ private List FindOpeningQuotationMarks() private bool IsOpeningQuotationMark(string quotationMark) { + if (_quotationMarkCounts.IsQuotationMarkProportionNegligible(quotationMark)) + return false; + if (_apostropheAnalyzer.IsApostropheOnly(quotationMark)) return false; @@ -475,6 +518,9 @@ private List FindClosingQuotationMarks() private bool IsClosingQuotationMark(string quotationMark) { + if (_quotationMarkCounts.IsQuotationMarkProportionNegligible(quotationMark)) + return false; + if (_apostropheAnalyzer.IsApostropheOnly(quotationMark)) return false; diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationFirstPass.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationFirstPass.cs new file mode 100644 index 00000000..b7a89038 --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationFirstPass.cs @@ -0,0 +1,9 @@ +namespace SIL.Machine.PunctuationAnalysis +{ + // This is a convenience class so that users don't have to know to normalize the source quote convention + public class QuotationMarkDenormalizationFirstPass : QuotationMarkUpdateFirstPass + { + public QuotationMarkDenormalizationFirstPass(QuoteConvention targetQuoteConvention) + : base(targetQuoteConvention.Normalize(), targetQuoteConvention) { } + } +} diff --git a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs similarity index 76% rename from src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs index f5ac923f..95de5971 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs @@ -1,17 +1,14 @@ -using SIL.Machine.PunctuationAnalysis; - -namespace SIL.Machine.Corpora +namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler { // This is a convenience class so that users don't have to know to normalize the source quote convention public QuotationMarkDenormalizationUsfmUpdateBlockHandler( - QuoteConvention sourceQuoteConvention, QuoteConvention targetQuoteConvention, QuotationMarkUpdateSettings settings = null ) : base( - sourceQuoteConvention.Normalize(), + targetQuoteConvention.Normalize(), targetQuoteConvention, settings ?? new QuotationMarkUpdateSettings() ) { } diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs similarity index 67% rename from src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs index f5106501..7ecc4fae 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs @@ -1,9 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using SIL.Machine.PunctuationAnalysis; - -namespace SIL.Machine.Corpora +namespace SIL.Machine.PunctuationAnalysis { // Determines the best strategy to take for each chapter public class QuotationMarkUpdateFirstPass : UsfmStructureExtractor @@ -12,46 +7,37 @@ public class QuotationMarkUpdateFirstPass : UsfmStructureExtractor private readonly DepthBasedQuotationMarkResolver _quotationMarkResolver; public bool WillFallbackModeWork { get; set; } - public QuotationMarkUpdateFirstPass( - QuoteConvention sourceQuoteConvention, - QuoteConvention targetQuoteConvention - ) + public QuotationMarkUpdateFirstPass(QuoteConvention oldQuoteConvention, QuoteConvention newQuoteConvention) { _quotationMarkFinder = new QuotationMarkFinder( - new QuoteConventionSet(new List { sourceQuoteConvention, targetQuoteConvention }) + new QuoteConventionSet(new List { oldQuoteConvention, newQuoteConvention }) ); _quotationMarkResolver = new DepthBasedQuotationMarkResolver( - new QuotationMarkUpdateResolutionSettings(sourceQuoteConvention) + new QuotationMarkUpdateResolutionSettings(oldQuoteConvention) ); - WillFallbackModeWork = CheckWhetherFallbackModeWillWork(sourceQuoteConvention, targetQuoteConvention); + WillFallbackModeWork = CheckWhetherFallbackModeWillWork(oldQuoteConvention, newQuoteConvention); } public bool CheckWhetherFallbackModeWillWork( - QuoteConvention sourceQuoteConvention, - QuoteConvention targetQuoteConvention + QuoteConvention oldQuoteConvention, + QuoteConvention newQuoteConvention ) { - var targetMarkBySourceMark = new Dictionary(); + var newMarkByOldMark = new Dictionary(); foreach ( - int depth in Enumerable.Range( - 1, - Math.Min(sourceQuoteConvention.NumLevels, targetQuoteConvention.NumLevels) - ) + int depth in Enumerable.Range(1, Math.Min(oldQuoteConvention.NumLevels, newQuoteConvention.NumLevels)) ) { - string openingQuotationMark = sourceQuoteConvention.GetOpeningQuotationMarkAtDepth(depth); - string closingQuotationMark = targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth); + string openingQuotationMark = oldQuoteConvention.GetOpeningQuotationMarkAtDepth(depth); + string closingQuotationMark = newQuoteConvention.GetClosingQuotationMarkAtDepth(depth); if ( - targetMarkBySourceMark.TryGetValue( - openingQuotationMark, - out string correspondingClosingQuotationMark - ) + newMarkByOldMark.TryGetValue(openingQuotationMark, out string correspondingClosingQuotationMark) && correspondingClosingQuotationMark != closingQuotationMark ) { return false; } - targetMarkBySourceMark[openingQuotationMark] = closingQuotationMark; + newMarkByOldMark[openingQuotationMark] = closingQuotationMark; } return true; } diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateResolutionSettings.cs similarity index 77% rename from src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateResolutionSettings.cs index 7791d048..9177275f 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateResolutionSettings.cs @@ -1,17 +1,14 @@ -using System.Collections.Generic; -using System.Text.RegularExpressions; - namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkUpdateResolutionSettings : IQuotationMarkResolutionSettings { - private readonly QuoteConvention _sourceQuoteConvention; + private readonly QuoteConvention _oldQuoteConvention; private readonly QuoteConventionSet _quoteConventionSingletonSet; - public QuotationMarkUpdateResolutionSettings(QuoteConvention sourceQuoteConvention) + public QuotationMarkUpdateResolutionSettings(QuoteConvention oldQuoteConvention) { - _sourceQuoteConvention = sourceQuoteConvention; - _quoteConventionSingletonSet = new QuoteConventionSet(new List { sourceQuoteConvention }); + _oldQuoteConvention = oldQuoteConvention; + _quoteConventionSingletonSet = new QuoteConventionSet(new List { oldQuoteConvention }); } public bool AreMarksAValidPair(string openingMark, string closingMark) @@ -31,7 +28,7 @@ public Regex GetOpeningQuotationMarkRegex() public HashSet GetPossibleDepths(string quotationMark, QuotationMarkDirection direction) { - return _sourceQuoteConvention.GetPossibleDepths(quotationMark, direction); + return _oldQuoteConvention.GetPossibleDepths(quotationMark, direction); } public bool IsValidClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch) @@ -46,7 +43,7 @@ public bool IsValidOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMa public bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction) { - return _sourceQuoteConvention.GetExpectedQuotationMark(depth, direction) == quotationMark; + return _oldQuoteConvention.GetExpectedQuotationMark(depth, direction) == quotationMark; } public bool ShouldRelyOnParagraphMarkers() diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateSettings.cs similarity index 93% rename from src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateSettings.cs index fc8b50fb..a10781c8 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateSettings.cs @@ -1,6 +1,4 @@ -using System.Collections.Generic; - -namespace SIL.Machine.Corpora +namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkUpdateSettings { diff --git a/src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateStrategy.cs similarity index 73% rename from src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs rename to src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateStrategy.cs index e6ae10b0..ae62e230 100644 --- a/src/SIL.Machine/Corpora/QuotationMarkUpdateStrategy.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateStrategy.cs @@ -1,4 +1,4 @@ -namespace SIL.Machine.Corpora +namespace SIL.Machine.PunctuationAnalysis { public enum QuotationMarkUpdateStrategy { diff --git a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionChangingUsfmUpdateBlockHandler.cs similarity index 94% rename from src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs rename to src/SIL.Machine/PunctuationAnalysis/QuoteConventionChangingUsfmUpdateBlockHandler.cs index 0817854d..f2ad4d2d 100644 --- a/src/SIL.Machine/Corpora/QuoteConventionChangingUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionChangingUsfmUpdateBlockHandler.cs @@ -1,13 +1,9 @@ -using System.Collections.Generic; -using System.Linq; -using SIL.Machine.PunctuationAnalysis; - -namespace SIL.Machine.Corpora +namespace SIL.Machine.PunctuationAnalysis { public class QuoteConventionChangingUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler { - private readonly QuoteConvention _sourceQuoteConvention; - private readonly QuoteConvention _targetQuoteConvention; + private readonly QuoteConvention _oldQuoteConvention; + private readonly QuoteConvention _newQuoteConvention; private readonly QuotationMarkUpdateSettings _settings; protected QuotationMarkFinder QuotationMarkFinder { get; set; } protected TextSegment.Builder NextScriptureTextSegmentBuilder { get; set; } @@ -19,23 +15,23 @@ public class QuoteConventionChangingUsfmUpdateBlockHandler : IUsfmUpdateBlockHan private int _currentVerseNumber; public QuoteConventionChangingUsfmUpdateBlockHandler( - QuoteConvention sourceQuoteConvention, - QuoteConvention targetQuoteConvention, + QuoteConvention oldQuoteConvention, + QuoteConvention newQuoteConvention, QuotationMarkUpdateSettings settings ) { - _sourceQuoteConvention = sourceQuoteConvention; - _targetQuoteConvention = targetQuoteConvention; + _oldQuoteConvention = oldQuoteConvention; + _newQuoteConvention = newQuoteConvention; _settings = settings; QuotationMarkFinder = new QuotationMarkFinder( - new QuoteConventionSet(new List { _sourceQuoteConvention }) + new QuoteConventionSet(new List { _oldQuoteConvention }) ); NextScriptureTextSegmentBuilder = new TextSegment.Builder(); IQuotationMarkResolutionSettings resolutionSettings = new QuotationMarkUpdateResolutionSettings( - sourceQuoteConvention + oldQuoteConvention ); // Each embed represents a separate context for quotation marks @@ -140,7 +136,7 @@ QuotationMarkMetadata resolvedQuotationMarkMatch ) { int previousLength = resolvedQuotationMarkMatch.Length; - resolvedQuotationMarkMatch.UpdateQuotationMark(_targetQuoteConvention); + resolvedQuotationMarkMatch.UpdateQuotationMark(_newQuoteConvention); int updatedLength = resolvedQuotationMarkMatch.Length; if (previousLength != updatedLength) diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs index 6b9fcfdc..81750a7a 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationTests.cs @@ -37,7 +37,6 @@ of the field which Yahweh God had made. Assert.IsNotNull(standardEnglishQuoteConvention); var quotationMarkDenormalizationFirstPass = new QuotationMarkDenormalizationFirstPass( - standardEnglishQuoteConvention, standardEnglishQuoteConvention ); @@ -46,7 +45,6 @@ of the field which Yahweh God had made. quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); var quotationMarkDenormalizer = new QuotationMarkDenormalizationUsfmUpdateBlockHandler( - standardEnglishQuoteConvention, standardEnglishQuoteConvention, new QuotationMarkUpdateSettings(chapterStrategies: bestChapterStrategies) ); diff --git a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs index c265d36f..fc709fac 100644 --- a/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/QuotationDenormalizationUsfmBlockUpdateHandlerTests.cs @@ -6,25 +6,24 @@ namespace SIL.Machine.Corpora; [TestFixture] public class QuotationMarkDenormalizationUsfmUpdateBlockHandlerTests { - private const string SimpleNormalizedUsfm = - @"\c 1 + [Test] + public void SimpleEnglishQuoteDenormalization() + { + string normalizedUsfm = + @"\c 1 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, 'You shall not eat of any tree of the garden'?"" "; - - [Test] - public void SimpleEnglishQuoteDenormalization() - { - string normalizedUsfm = SimpleNormalizedUsfm; + ; string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_english"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); } @@ -44,43 +43,51 @@ of the field which Yahweh God had made. + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "british_english", "british_english"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "british_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); - - // no denormalization should be needed for this example } + // no denormalization should be needed for this example [Test] public void SimpleTypewriterEnglishQuoteDenormalization() { - string normalizedUsfm = SimpleNormalizedUsfm; + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + ; string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, \"Has God really said, 'You shall not eat of any tree of the garden'?\"" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "typewriter_english"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); - - // some of the quotes shouldn't need to be denormalized } + // some of the quotes shouldn't need to be denormalized [Test] public void SimpleHybridTypewriterEnglishQuoteDenormalization() { - string normalizedUsfm = SimpleNormalizedUsfm; + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + ; string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, “Has God really said, 'You shall not eat of any tree of the garden'?”" ); - string observedUsfm = DenormalizeQuotationMarks( - normalizedUsfm, - "standard_english", - "hybrid_typewriter_english" - ); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "hybrid_typewriter_english"); AssertUsfmEqual(observedUsfm, expectedUsfm); // the single guillemets shouldn't need to be denormalized @@ -103,7 +110,7 @@ of the field which Yahweh God had made. + "the woman, «Has God really said, ‹You shall not eat of any tree of the garden›?»" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_french", "standard_french"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_french"); AssertUsfmEqual(observedUsfm, expectedUsfm); // the unusual quotation marks shouldn't need to be denormalized @@ -125,12 +132,11 @@ of the field which Yahweh God had made. + "the woman, <?>>" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_french", "typewriter_french"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_french"); AssertUsfmEqual(observedUsfm, expectedUsfm); - - // the 1st- and 2nd-level quotes are denormalized to identical marks } + // the 1st- and 2nd-level quotes are denormalized to identical marks [Test] public void SimpleWesternEuropeanQuoteDenormalization() { @@ -147,7 +153,7 @@ of the field which Yahweh God had made. + "the woman, «Has God really said, “You shall not eat of any tree of the garden”?»" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "western_european", "western_european"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "western_european"); AssertUsfmEqual(observedUsfm, expectedUsfm); } @@ -167,11 +173,7 @@ of the field which Yahweh God had made. + "the woman, <>" ); - string observedUsfm = DenormalizeQuotationMarks( - normalizedUsfm, - "typewriter_western_european", - "typewriter_western_european" - ); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_western_european"); AssertUsfmEqual(observedUsfm, expectedUsfm); } @@ -191,11 +193,7 @@ of the field which Yahweh God had made. + "the woman, \"Has God really said, ?\"" ); - string observedUsfm = DenormalizeQuotationMarks( - normalizedUsfm, - "typewriter_western_european_variant", - "typewriter_western_european_variant" - ); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "typewriter_western_european_variant"); AssertUsfmEqual(observedUsfm, expectedUsfm); } @@ -215,11 +213,7 @@ of the field which Yahweh God had made. + "the woman, «Has God really said, \"You shall not eat of any tree of the garden\"?»" ); - string observedUsfm = DenormalizeQuotationMarks( - normalizedUsfm, - "hybrid_typewriter_western_european", - "hybrid_typewriter_western_european" - ); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "hybrid_typewriter_western_european"); AssertUsfmEqual(observedUsfm, expectedUsfm); } @@ -239,7 +233,7 @@ of the field which Yahweh God had made. + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden‘?“" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "central_european", "central_european"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "central_european"); AssertUsfmEqual(observedUsfm, expectedUsfm); } @@ -259,11 +253,7 @@ of the field which Yahweh God had made. + "the woman, »Has God really said, ›You shall not eat of any tree of the garden‹?«" ); - string observedUsfm = DenormalizeQuotationMarks( - normalizedUsfm, - "central_european_guillemets", - "central_european_guillemets" - ); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "central_european_guillemets"); AssertUsfmEqual(observedUsfm, expectedUsfm); } @@ -283,35 +273,49 @@ of the field which Yahweh God had made. + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden’?”" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_swedish", "standard_swedish"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_swedish"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleFinnishQuoteDenormalization() { - string normalizedUsfm = SimpleNormalizedUsfm; + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + ; string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, »Has God really said, ’You shall not eat of any tree of the garden’?»" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_finnish"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_finnish"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleEasternEuropeanQuoteDenormalization() { - string normalizedUsfm = SimpleNormalizedUsfm; + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + ; string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden’?”" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "eastern_european"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "eastern_european"); AssertUsfmEqual(observedUsfm, expectedUsfm); } @@ -331,28 +335,42 @@ of the field which Yahweh God had made. + "the woman, «Has God really said, „You shall not eat of any tree of the garden“?»" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_russian", "standard_russian"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_russian"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void SimpleArabicQuoteDenormalization() { - string normalizedUsfm = SimpleNormalizedUsfm; + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + ; string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden‘?“" ); - string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_english", "standard_arabic"); + string observedUsfm = DenormalizeQuotationMarks(normalizedUsfm, "standard_arabic"); AssertUsfmEqual(observedUsfm, expectedUsfm); } [Test] public void FallbackQuotationDenormalizationSameAsFull() { - string normalizedUsfm = SimpleNormalizedUsfm; + string normalizedUsfm = + @"\c 1 + \v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ""Has God really said, + 'You shall not eat of any tree of the garden'?"" + "; + ; string expectedUsfm = ( "\\c 1\n" + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " @@ -362,7 +380,6 @@ public void FallbackQuotationDenormalizationSameAsFull() string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "standard_english", - "standard_english", new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) ); AssertUsfmEqual(observedUsfm, expectedUsfm); @@ -387,7 +404,6 @@ of the field which Yahweh God had made. string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "standard_english", - "standard_english", new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) ); AssertUsfmEqual(observedUsfm, expectedUsfm); @@ -412,7 +428,6 @@ of the field which Yahweh God had made. string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "standard_english", - "standard_english", new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) ); AssertUsfmEqual(observedUsfm, expectedUsfm); @@ -437,7 +452,6 @@ You shall not eat of any tree of the garden'?"" string observedUsfm = DenormalizeQuotationMarks( normalizedUsfm, "standard_english", - "standard_english", new QuotationMarkUpdateSettings(QuotationMarkUpdateStrategy.ApplyFallback) ); AssertUsfmEqual(observedUsfm, expectedUsfm); @@ -445,7 +459,6 @@ You shall not eat of any tree of the garden'?"" public string DenormalizeQuotationMarks( string normalizedUsfm, - string sourceQuoteConventionName, string targetQuoteConventionName, QuotationMarkUpdateSettings? quotationDenormalizationSettings = null ) @@ -453,7 +466,6 @@ public string DenormalizeQuotationMarks( quotationDenormalizationSettings ??= new QuotationMarkUpdateSettings(); QuotationMarkDenormalizationUsfmUpdateBlockHandler quotationDenormalizer = ( CreateQuotationDenormalizationUsfmUpdateBlockHandler( - sourceQuoteConventionName, targetQuoteConventionName, quotationDenormalizationSettings ) @@ -466,17 +478,14 @@ public string DenormalizeQuotationMarks( } public QuotationMarkDenormalizationUsfmUpdateBlockHandler CreateQuotationDenormalizationUsfmUpdateBlockHandler( - string sourceQuoteConventionName, string targetQuoteConventionName, QuotationMarkUpdateSettings? quotationDenormalizationSettings = null ) { quotationDenormalizationSettings ??= new QuotationMarkUpdateSettings(); - QuoteConvention sourceQuoteConvention = GetQuoteConventionByName(sourceQuoteConventionName); QuoteConvention targetQuoteConvention = GetQuoteConventionByName(targetQuoteConventionName); return new QuotationMarkDenormalizationUsfmUpdateBlockHandler( - sourceQuoteConvention, targetQuoteConvention, quotationDenormalizationSettings ); From db406073d9bd3fe0d739b87f7530e0997913949c Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 29 Sep 2025 20:22:39 -0400 Subject: [PATCH 2/2] Re-add imports; change namespaces --- .../Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs | 3 +++ .../PunctuationAnalysis/FallbackQuotationMarkResolver.cs | 3 +++ .../ParatextProjectQuoteConventionDetector.cs | 5 +++++ .../PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs | 4 ++++ .../QuotationMarkUpdateResolutionSettings.cs | 3 +++ .../PunctuationAnalysis/QuotationMarkUpdateSettings.cs | 2 ++ .../QuoteConventionChangingUsfmUpdateBlockHandler.cs | 4 ++++ .../ZipParatextProjectQuoteConventionDetector.cs | 3 ++- .../MemoryParatextProjectQuoteConvetionDetector.cs | 3 ++- 9 files changed, 28 insertions(+), 2 deletions(-) rename src/SIL.Machine/{Corpora => PunctuationAnalysis}/ZipParatextProjectQuoteConventionDetector.cs (91%) rename tests/SIL.Machine.Tests/{Corpora => PunctuationAnalysis}/MemoryParatextProjectQuoteConvetionDetector.cs (89%) diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs index 4dd5ae40..4312319e 100644 --- a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs @@ -1,3 +1,6 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Extensions; using SIL.Machine.Translation; namespace SIL.Machine.Corpora diff --git a/src/SIL.Machine/PunctuationAnalysis/FallbackQuotationMarkResolver.cs b/src/SIL.Machine/PunctuationAnalysis/FallbackQuotationMarkResolver.cs index e0e0591f..c4ab81ab 100644 --- a/src/SIL.Machine/PunctuationAnalysis/FallbackQuotationMarkResolver.cs +++ b/src/SIL.Machine/PunctuationAnalysis/FallbackQuotationMarkResolver.cs @@ -1,3 +1,6 @@ +using System.Collections.Generic; +using System.Linq; + namespace SIL.Machine.PunctuationAnalysis { public class FallbackQuotationMarkResolver : IQuotationMarkResolver diff --git a/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs index a32b6323..5cbdc52a 100644 --- a/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs @@ -1,3 +1,8 @@ +using System; +using System.IO; +using System.Text; +using SIL.Machine.Corpora; + namespace SIL.Machine.PunctuationAnalysis { public abstract class ParatextProjectQuoteConventionDetector diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs index 7ecc4fae..866fafc9 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs @@ -1,3 +1,7 @@ +using System; +using System.Collections.Generic; +using System.Linq; + namespace SIL.Machine.PunctuationAnalysis { // Determines the best strategy to take for each chapter diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateResolutionSettings.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateResolutionSettings.cs index 9177275f..ac73a1aa 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateResolutionSettings.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateResolutionSettings.cs @@ -1,3 +1,6 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkUpdateResolutionSettings : IQuotationMarkResolutionSettings diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateSettings.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateSettings.cs index a10781c8..3b124e83 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateSettings.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateSettings.cs @@ -1,3 +1,5 @@ +using System.Collections.Generic; + namespace SIL.Machine.PunctuationAnalysis { public class QuotationMarkUpdateSettings diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionChangingUsfmUpdateBlockHandler.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionChangingUsfmUpdateBlockHandler.cs index f2ad4d2d..a0e8ea7a 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionChangingUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionChangingUsfmUpdateBlockHandler.cs @@ -1,3 +1,7 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Machine.Corpora; + namespace SIL.Machine.PunctuationAnalysis { public class QuoteConventionChangingUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/ZipParatextProjectQuoteConventionDetector.cs similarity index 91% rename from src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs rename to src/SIL.Machine/PunctuationAnalysis/ZipParatextProjectQuoteConventionDetector.cs index 91736056..fa8af932 100644 --- a/src/SIL.Machine/Corpora/ZipParatextProjectQuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/ZipParatextProjectQuoteConventionDetector.cs @@ -1,7 +1,8 @@ using System.IO; using System.IO.Compression; +using SIL.Machine.Corpora; -namespace SIL.Machine.Corpora +namespace SIL.Machine.PunctuationAnalysis { public class ZipParatextProjectQuoteConventionDetector : ParatextProjectQuoteConventionDetector { diff --git a/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/MemoryParatextProjectQuoteConvetionDetector.cs similarity index 89% rename from tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/MemoryParatextProjectQuoteConvetionDetector.cs index 01d959d8..d74f7483 100644 --- a/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectQuoteConvetionDetector.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/MemoryParatextProjectQuoteConvetionDetector.cs @@ -1,6 +1,7 @@ using System.Text; +using SIL.Machine.Corpora; -namespace SIL.Machine.Corpora; +namespace SIL.Machine.PunctuationAnalysis; public class MemoryParatextProjectQuoteConventionDetector( ParatextProjectSettings settings,