From f3a94f8eedab0531e92740cff60ed3cb54975a6f Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 18 Nov 2025 17:25:12 -0500 Subject: [PATCH 1/6] Port machine.py changes --- .../ParatextProjectQuoteConventionDetector.cs | 34 ++---- .../QuotationMarkTabulator.cs | 80 +++++++++++--- .../PunctuationAnalysis/QuoteConvention.cs | 18 ++++ .../QuoteConventionAnalysis.cs | 100 ++++++++++++++++++ .../QuoteConventionDetector.cs | 32 +----- .../PunctuationAnalysis/QuoteConventionSet.cs | 11 ++ .../StandardQuoteConventions.cs | 9 ++ .../Corpora/UsfmManualTests.cs | 13 +-- ...textProjectQuoteConventionDetectorTests.cs | 44 ++++++-- .../QuotationMarkTabulatorTests.cs | 6 +- ...sts.cs => QuoteConventionDetectorTests.cs} | 46 ++++---- .../QuoteConventionSetTests.cs | 8 +- 12 files changed, 284 insertions(+), 117 deletions(-) create mode 100644 src/SIL.Machine/PunctuationAnalysis/QuoteConventionAnalysis.cs rename tests/SIL.Machine.Tests/PunctuationAnalysis/{QuotationConventionDetectorTests.cs => QuoteConventionDetectorTests.cs} (88%) diff --git a/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs index a7821049..1c928bed 100644 --- a/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs @@ -22,29 +22,12 @@ ParatextProjectSettings settings _paratextProjectFileHandler = paratextProjectFileHandler; } - public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null) - { - Dictionary> includeChapters = null; - return GetQuoteConventionAnalysis(handler, includeChapters); - } - - public QuoteConventionAnalysis GetQuoteConventionAnalysis( - QuoteConventionDetector handler = null, - IReadOnlyDictionary> includeChapters = null - ) - { - return GetQuoteConventionAnalysis( - handler, - includeChapters?.ToDictionary(kvp => Canon.BookIdToNumber(kvp.Key), kvp => kvp.Value) - ); - } - public QuoteConventionAnalysis GetQuoteConventionAnalysis( - QuoteConventionDetector handler = null, IReadOnlyDictionary> includeChapters = null ) { - handler = handler ?? new QuoteConventionDetector(); + var bookQuoteConventionsAnalyses = new List(); + foreach ( string bookId in Canon .AllBookNumbers.Where(num => Canon.IsCanonical(num)) @@ -54,12 +37,14 @@ string bookId in Canon if (includeChapters != null && !includeChapters.ContainsKey(Canon.BookIdToNumber(bookId))) continue; + var handler = new QuoteConventionDetector(); + string fileName = _settings.GetBookFileName(bookId); - if (!Exists(fileName)) + if (!_paratextProjectFileHandler.Exists(fileName)) continue; string usfm; - using (var reader = new StreamReader(Open(fileName))) + using (var reader = new StreamReader(_paratextProjectFileHandler.Open(fileName))) { usfm = reader.ReadToEnd(); } @@ -77,12 +62,9 @@ string bookId in Canon sb.Append($". Error: '{ex.Message}'"); throw new InvalidOperationException(sb.ToString(), ex); } + bookQuoteConventionsAnalyses.Add(handler.DetectQuoteConvention(includeChapters)); } - return handler.DetectQuoteConvention(includeChapters); + return QuoteConventionAnalysis.CombineWithWeightedAverage(bookQuoteConventionsAnalyses); } - - private bool Exists(string fileName) => _paratextProjectFileHandler.Exists(fileName); - - private Stream Open(string fileName) => _paratextProjectFileHandler.Open(fileName); } } diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs index e12a2054..e77a37e0 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -1,5 +1,5 @@ -using System; using System.Collections.Generic; +using System.Linq; using System.Text; using SIL.Extensions; @@ -23,6 +23,16 @@ public void CountQuotationMark(string quotationMark) TotalCount++; } + public void CountFrom(QuotationMarkCounts quotationMarkCounts) + { + foreach (KeyValuePair kvp in quotationMarkCounts._quotationMarkCounter) + { + (string quotationMark, int count) = (kvp.Key, kvp.Value); + _quotationMarkCounter.UpdateValue(quotationMark, () => 0, i => i + count); + } + TotalCount += quotationMarkCounts.TotalCount; + } + public (string BestString, int BestStringCount, int TotalStringCount) FindBestQuotationMarkProportion() { string bestString = _quotationMarkCounter.MaxBy(kvp => kvp.Value).Key; @@ -60,6 +70,24 @@ public void Tabulate(List quotationMarks) } } + public void TabulateFrom(QuotationMarkTabulator tabulatedQuotationMarks) + { + foreach ( + KeyValuePair< + (int, QuotationMarkDirection), + QuotationMarkCounts + > kvp in tabulatedQuotationMarks._quotationCountsByDepthAndDirection + ) + { + ((int depth, QuotationMarkDirection direction), QuotationMarkCounts counts) = (kvp.Key, kvp.Value); + if (!_quotationCountsByDepthAndDirection.ContainsKey((depth, direction))) + { + _quotationCountsByDepthAndDirection[(depth, direction)] = new QuotationMarkCounts(); + } + _quotationCountsByDepthAndDirection[(depth, direction)].CountFrom(counts); + } + } + private void CountQuotationMark(QuotationMarkMetadata quote) { (int Depth, QuotationMarkDirection Direction) key = (quote.Depth, quote.Direction); @@ -75,26 +103,52 @@ private void CountQuotationMark(QuotationMarkMetadata quote) ); } + public int GetTotalQuotationMarkCount() + { + return _quotationCountsByDepthAndDirection.Values.Select(c => c.TotalCount).Sum(); + } + public double CalculateSimilarity(QuoteConvention quoteConvention) { - double weightedDifference = 0.0; - double totalWeight = 0.0; - foreach ((int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys) + var numMarksByDepth = new Dictionary(); + var numMatchingMarksByDepth = new Dictionary(); + foreach ( + (int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys.OrderBy(k => + k + ) + ) { string expectedQuotationMark = quoteConvention.GetExpectedQuotationMark(depth, direction); - - // Give higher weight to shallower depths, since deeper marks are more likely to be mistakes - weightedDifference += ( - _quotationCountsByDepthAndDirection[(depth, direction)] - .CalculateNumDifferences(expectedQuotationMark) * Math.Pow(2, -depth) + int numMatchingMarks = _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount; + numMarksByDepth.UpdateValue(depth, () => 0, i => i + numMatchingMarks); + numMatchingMarksByDepth.UpdateValue( + depth, + () => 0, + i => + i + + numMatchingMarks + - _quotationCountsByDepthAndDirection[(depth, direction)] + .CalculateNumDifferences(expectedQuotationMark) ); - totalWeight += _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount * Math.Pow(2, -depth); } - if (totalWeight == 0.0) + + // The scores of greater depths depend on the scores of shallower depths + var scoresByDepth = new Dictionary(); + foreach (int depth in numMarksByDepth.Keys.OrderBy(k => k)) { - return 0.0; + double previousDepthScore = 1; + if (scoresByDepth.TryGetValue(depth - 1, out double score)) + { + previousDepthScore = score / numMarksByDepth[depth - 1]; + } + scoresByDepth[depth] = previousDepthScore * numMatchingMarksByDepth[depth]; } - return 1 - (weightedDifference / totalWeight); + int totalMarks = numMarksByDepth.Values.Sum(); + double totalScore = scoresByDepth.Values.Sum(); + + if (totalMarks == 0) + return 0; + return totalScore / totalMarks; } private bool DepthAndDirectionObserved(int depth, QuotationMarkDirection direction) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs index c8e17e85..756d1efb 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs @@ -42,6 +42,14 @@ out char quote : ClosingQuotationMark; return new SingleLevelQuoteConvention(normalizedOpeningQuotationMark, normalizedClosingQuotationMark); } + + public override int GetHashCode() + { + int hashCode = 23; + hashCode = hashCode * 31 + OpeningQuotationMark.GetHashCode(); + hashCode = hashCode * 31 + ClosingQuotationMark.GetHashCode(); + return hashCode; + } } public class QuoteConvention @@ -150,5 +158,15 @@ public QuoteConvention Normalize() { return new QuoteConvention(Name + "_normalized", LevelConventions.Select(l => l.Normalize()).ToList()); } + + public override int GetHashCode() + { + int hashCode = 23; + foreach (SingleLevelQuoteConvention quoteConvention in LevelConventions) + { + hashCode = hashCode * 31 + quoteConvention.GetHashCode(); + } + return hashCode; + } } } diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionAnalysis.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionAnalysis.cs new file mode 100644 index 00000000..cfeda0bb --- /dev/null +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionAnalysis.cs @@ -0,0 +1,100 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Extensions; + +namespace SIL.Machine.PunctuationAnalysis +{ + public class QuoteConventionAnalysis + { + public QuoteConvention BestQuoteConvention { get; private set; } + public double BestQuoteConventionScore { get; private set; } + public string AnalysisSummary { get; private set; } + public IReadOnlyDictionary ConventionScores { get; private set; } + public QuotationMarkTabulator TabulatedQuotationMarks { get; private set; } + public double AnalysisWeight { get; private set; } + + public QuoteConventionAnalysis( + Dictionary conventionScores, + QuotationMarkTabulator tabulatedQuotationMarks, + double analysisWeight = 1.0 + ) + { + ConventionScores = conventionScores; + if (ConventionScores.Count > 0) + { + KeyValuePair maxKvp = ConventionScores.MaxBy(kvp => kvp.Value); + (BestQuoteConvention, BestQuoteConventionScore) = (maxKvp.Key, maxKvp.Value); + } + else + { + BestQuoteConventionScore = 0; + BestQuoteConvention = null; + } + TabulatedQuotationMarks = tabulatedQuotationMarks; + AnalysisWeight = analysisWeight; + } + + public class Builder + { + public Dictionary ConventionScores { get; private set; } + public QuotationMarkTabulator TabulatedQuotationMarks { get; private set; } + + public Builder(QuotationMarkTabulator tabulatedQuotationMarks) + { + ConventionScores = new Dictionary(); + TabulatedQuotationMarks = tabulatedQuotationMarks; + } + + public void RecordConventionScore(QuoteConvention quoteConvention, double score) + { + ConventionScores[quoteConvention] = score; + } + + public QuoteConventionAnalysis Build() + { + return new QuoteConventionAnalysis( + ConventionScores, + TabulatedQuotationMarks, + TabulatedQuotationMarks.GetTotalQuotationMarkCount() + ); + } + } + + public static QuoteConventionAnalysis CombineWithWeightedAverage( + List quoteConventionAnalyses + ) + { + double totalWeight = 0; + Dictionary conventionVotes = new Dictionary(); + Dictionary quoteConventionsByName = new Dictionary(); + QuotationMarkTabulator totalTabulatedQuotationMarks = new QuotationMarkTabulator(); + foreach (QuoteConventionAnalysis quoteConventionAnalysis in quoteConventionAnalyses) + { + totalTabulatedQuotationMarks.TabulateFrom(quoteConventionAnalysis.TabulatedQuotationMarks); + totalWeight += quoteConventionAnalysis.AnalysisWeight; + foreach ( + (QuoteConvention convention, double score) in quoteConventionAnalysis.ConventionScores.Select(kvp => + (kvp.Key, kvp.Value) + ) + ) + { + quoteConventionsByName[convention.Name] = convention; + conventionVotes.UpdateValue( + convention.Name, + () => 0, + s => s + score * quoteConventionAnalysis.AnalysisWeight + ); + } + } + QuoteConventionAnalysis.Builder builder = new QuoteConventionAnalysis.Builder(totalTabulatedQuotationMarks); + foreach ((string conventionName, double totalScore) in conventionVotes.Select(kvp => (kvp.Key, kvp.Value))) + { + if (totalScore > 0) + { + builder.RecordConventionScore(quoteConventionsByName[conventionName], totalScore / totalWeight); + } + } + return builder.Build(); + } + } +} diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs index 3200e2ab..691f78f3 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs @@ -3,24 +3,6 @@ namespace SIL.Machine.PunctuationAnalysis { - public class QuoteConventionAnalysis - { - public QuoteConvention BestQuoteConvention { get; private set; } - public double BestQuoteConventionScore { get; private set; } - public string AnalysisSummary { get; private set; } - - public QuoteConventionAnalysis( - QuoteConvention bestQuoteConvention, - double bestQuoteConventionScore, - string analysisSummary - ) - { - BestQuoteConvention = bestQuoteConvention; - BestQuoteConventionScore = bestQuoteConventionScore; - AnalysisSummary = analysisSummary; - } - } - public class QuoteConventionDetector : UsfmStructureExtractor { private readonly QuotationMarkTabulator _quotationMarkTabulator; @@ -60,19 +42,7 @@ public QuoteConventionAnalysis DetectQuoteConvention(IReadOnlyDictionary 0 && bestQuoteConvention != null) - { - return new QuoteConventionAnalysis( - bestQuoteConvention, - score, - _quotationMarkTabulator.GetSummaryMessage() - ); - } - return null; + return QuoteConventions.Standard.ScoreAllQuoteConventions(_quotationMarkTabulator); } } } diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs index f208df92..6c6e7401 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs @@ -230,5 +230,16 @@ QuotationMarkTabulator tabulatedQuotationMarks } return (bestQuoteConvention, bestSimilarity); } + + public QuoteConventionAnalysis ScoreAllQuoteConventions(QuotationMarkTabulator tabulatedQuotationMarks) + { + var builder = new QuoteConventionAnalysis.Builder(tabulatedQuotationMarks); + foreach (QuoteConvention convention in Conventions) + { + double score = tabulatedQuotationMarks.CalculateSimilarity(convention); + builder.RecordConventionScore(convention, score); + } + return builder.Build(); + } } } diff --git a/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs index 5720198e..4a8f578d 100644 --- a/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs +++ b/src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs @@ -215,6 +215,15 @@ public static class QuoteConventions new SingleLevelQuoteConvention("\u2019", "\u2018"), } ), + new QuoteConvention( + "arabic_inspired_western_european", + new List + { + new SingleLevelQuoteConvention("\u00ab", "\u00bb"), + new SingleLevelQuoteConvention("\u201d", "\u201c"), + new SingleLevelQuoteConvention("\u2019", "\u2018"), + } + ), } ); } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 96966bfd..762acdf6 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -60,23 +60,18 @@ string sfmFileName in Directory [Ignore("This is for manual testing only. Remove this tag to run the test.")] public void AnalyzeCorporaQuoteConventions() { - var sourceHandler = new QuoteConventionDetector(); using ZipArchive zipArchive = ZipFile.OpenRead(CorporaTestHelpers.UsfmSourceProjectZipPath); var quoteConventionDetector = new ZipParatextProjectQuoteConventionDetector(zipArchive); - quoteConventionDetector.GetQuoteConventionAnalysis(sourceHandler); + QuoteConventionAnalysis sourceAnalysis = quoteConventionDetector.GetQuoteConventionAnalysis(); - var targetHandler = new QuoteConventionDetector(); using ZipArchive zipArchive2 = ZipFile.OpenRead(CorporaTestHelpers.UsfmTargetProjectZipPath); var quoteConventionDetector2 = new ZipParatextProjectQuoteConventionDetector(zipArchive2); - quoteConventionDetector2.GetQuoteConventionAnalysis(targetHandler); - - QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuoteConvention(); - QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuoteConvention(); + QuoteConventionAnalysis targetAnalysis = quoteConventionDetector2.GetQuoteConventionAnalysis(); Assert.Multiple(() => { - Assert.NotNull(sourceAnalysis); - Assert.NotNull(targetAnalysis); + Assert.NotNull(sourceAnalysis.BestQuoteConvention); + Assert.NotNull(targetAnalysis.BestQuoteConvention); }); } diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/ParatextProjectQuoteConventionDetectorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/ParatextProjectQuoteConventionDetectorTests.cs index e083961b..09bc1e0c 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/ParatextProjectQuoteConventionDetectorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/ParatextProjectQuoteConventionDetectorTests.cs @@ -27,7 +27,7 @@ public void TestGetQuotationAnalysis() } ); QuoteConventionAnalysis analysis = env.GetQuoteConvention(); - Assert.That(analysis, Is.Not.Null); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8)); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); } @@ -53,13 +53,13 @@ public void TestGetQuotationByBook() } ); QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK"); - Assert.That(analysis, Is.Not.Null); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8)); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french")); } [Test] - public void TestGetQuotationConventionByChapter() + public void TestGetQuoteConventionByChapter() { var env = new TestEnvironment( files: new Dictionary() @@ -83,13 +83,13 @@ public void TestGetQuotationConventionByChapter() } ); QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK2,4-5"); - Assert.That(analysis, Is.Not.Null); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.66)); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french")); } [Test] - public void TestGetQuotationConventionByChapterIndeterminate() + public void TestGetQuoteConventionByChapterIndeterminate() { var env = new TestEnvironment( files: new Dictionary() @@ -105,11 +105,11 @@ public void TestGetQuotationConventionByChapterIndeterminate() } ); QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT1,3"); - Assert.That(analysis, Is.Null); + Assert.That(analysis.BestQuoteConvention, Is.Null); } [Test] - public void TestGetQuotationConventionInvalidBookCode() + public void TestGetQuoteConventionInvalidBookCode() { var env = new TestEnvironment( files: new Dictionary() @@ -123,7 +123,35 @@ public void TestGetQuotationConventionInvalidBookCode() } ); QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT"); - Assert.That(analysis, Is.Null); + Assert.That(analysis.BestQuoteConvention, Is.Null); + } + + [Test] + public void TestGetQuoteConventionWeightedAverageOfMultipleBooks() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "41MATTest.SFM", + $@"\id MAT +{GetTestChapter(1, StandardEnglishQuoteConvention)} +" + }, + { + "42MRKTest.SFM", + $@"\id MRK +\c 1 +\v 1 This ""sentence uses a different"" convention +" + } + } + ); + QuoteConventionAnalysis analysis = env.GetQuoteConvention(); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); + Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8)); + Assert.That(analysis.BestQuoteConventionScore, Is.LessThan(0.9)); } private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary? files = null) diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs index c3daec46..e9962af3 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkTabulatorTests.cs @@ -169,7 +169,7 @@ public void CalculateSimilarity() twoLevelQuotationMarkTabulator.CalculateSimilarity( new QuoteConvention("", [new SingleLevelQuoteConvention("\u201c", "\u201d")]) ), - Is.EqualTo(0.66666666666667).Within(1e-9) + Is.EqualTo(0.5).Within(1e-9) ); Assert.That( twoLevelQuotationMarkTabulator.CalculateSimilarity( @@ -193,7 +193,7 @@ public void CalculateSimilarity() ] ) ), - Is.EqualTo(0.66666666666667).Within(1e-9) + Is.EqualTo(0.5).Within(1e-9) ); Assert.That( twoLevelQuotationMarkTabulator.CalculateSimilarity( @@ -205,7 +205,7 @@ public void CalculateSimilarity() ] ) ), - Is.EqualTo(0.33333333333333).Within(1e-9) + Is.EqualTo(0.0) ); } } diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionDetectorTests.cs similarity index 88% rename from tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs rename to tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionDetectorTests.cs index 212daca8..d74e4626 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionDetectorTests.cs @@ -4,7 +4,7 @@ namespace SIL.Machine.PunctuationAnalysis; [TestFixture] -public class QuotationConventionDetectorTests +public class QuoteConventionDetectorTests { // Text comes from the World English Bible, which is in the public domain. [Test] @@ -19,7 +19,7 @@ of the field which Yahweh God had made. ‘You shall not eat of any tree of the garden’?” "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); } @@ -35,7 +35,7 @@ of the field which Yahweh God had made. 'You shall not eat of any tree of the garden'?\"" "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_english")); } @@ -51,7 +51,7 @@ of the field which Yahweh God had made. “You shall not eat of any tree of the garden”?’ "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_english")); } @@ -67,7 +67,7 @@ of the field which Yahweh God had made. ""You shall not eat of any tree of the garden""?' "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_typewriter_english")); } @@ -83,7 +83,7 @@ of the field which Yahweh God had made. 'You shall not eat of any tree of the garden'?” "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_typewriter_english")); } @@ -99,7 +99,7 @@ of the field which Yahweh God had made. ‹You shall not eat of any tree of the garden›?» "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french")); } @@ -115,7 +115,7 @@ of the field which Yahweh God had made. ?>> "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_french")); } @@ -132,7 +132,7 @@ of the field which Yahweh God had made. “You shall not eat of any tree of the garden”?» "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("western_european")); } @@ -148,7 +148,7 @@ of the field which Yahweh God had made. ‘You shall not eat of any tree of the garden’?» "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("british_inspired_western_european")); } @@ -164,7 +164,7 @@ of the field which Yahweh God had made. ""You shall not eat of any tree of the garden""?>> "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_western_european")); } @@ -180,7 +180,7 @@ of the field which Yahweh God had made. ?"" "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("typewriter_western_european_variant")); } @@ -196,7 +196,7 @@ of the field which Yahweh God had made. ""You shall not eat of any tree of the garden""?» "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_typewriter_western_european")); } @@ -212,7 +212,7 @@ of the field which Yahweh God had made. 'You shall not eat of any tree of the garden'?» "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("hybrid_british_typewriter_western_european")); } @@ -228,7 +228,7 @@ of the field which Yahweh God had made. ‚You shall not eat of any tree of the garden‘?“ "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("central_european")); } @@ -244,7 +244,7 @@ of the field which Yahweh God had made. ›You shall not eat of any tree of the garden‹?« "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("central_european_guillemets")); } @@ -260,7 +260,7 @@ of the field which Yahweh God had made. ’You shall not eat of any tree of the garden’?” "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_swedish")); } @@ -276,7 +276,7 @@ of the field which Yahweh God had made. ’You shall not eat of any tree of the garden’?» "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_finnish")); } @@ -292,7 +292,7 @@ of the field which Yahweh God had made. ‚You shall not eat of any tree of the garden’?” "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("eastern_european")); } @@ -308,7 +308,7 @@ of the field which Yahweh God had made. „You shall not eat of any tree of the garden“?» "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_russian")); } @@ -324,7 +324,7 @@ of the field which Yahweh God had made. ’You shall not eat of any tree of the garden‘?“ "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_arabic")); } @@ -340,7 +340,7 @@ of the field which Yahweh God had made. ’You shall not eat of any tree of the garden‘?» "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("non-standard_arabic")); } @@ -360,7 +360,7 @@ of the field which Yahweh God had made. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’ "; QuoteConventionAnalysis analysis = DetectQuotationConvention(usfm); - Assert.IsNotNull(analysis); + Assert.That(analysis.BestQuoteConvention, Is.Not.Null); Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); } diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs index 305282c6..25afafbb 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuoteConventionSetTests.cs @@ -1715,12 +1715,12 @@ public void FindMostSimilarConvention() noisyMultipleEnglishQuotesTabulator ); Assert.That(convention, Is.EqualTo(standardEnglishQuoteConvention)); - Assert.That(similarity, Is.EqualTo(0.9).Within(1e-9)); + Assert.That(similarity, Is.EqualTo(0.8333333333333).Within(1e-9)); (convention, similarity) = twoFrenchQuoteConventionSet.FindMostSimilarConvention( noisyMultipleEnglishQuotesTabulator ); Assert.That(convention, Is.EqualTo(westernEuropeanQuoteConvention)); - Assert.That(similarity, Is.EqualTo(0.1).Within(1e-9)); + Assert.That(similarity, Is.EqualTo(0)); var noisyMultipleFrenchQuotesTabulator = new QuotationMarkTabulator(); noisyMultipleFrenchQuotesTabulator.Tabulate( @@ -1795,7 +1795,7 @@ public void FindMostSimilarConvention() noisyMultipleFrenchQuotesTabulator ); Assert.That(convention, Is.EqualTo(standardFrenchQuoteConvention)); - Assert.That(similarity, Is.EqualTo(0.916666666666).Within(1e-9)); + Assert.That(similarity, Is.EqualTo(0.875).Within(1e-9)); var tooDeepEnglishQuotesTabulator = new QuotationMarkTabulator(); tooDeepEnglishQuotesTabulator.Tabulate( @@ -1844,7 +1844,7 @@ public void FindMostSimilarConvention() ); (convention, similarity) = allThreeQuoteConventionSet.FindMostSimilarConvention(tooDeepEnglishQuotesTabulator); Assert.That(convention, Is.EqualTo(standardEnglishQuoteConvention)); - Assert.That(similarity, Is.EqualTo(0.967741935483871).Within(1e-9)); + Assert.That(similarity, Is.EqualTo(0.8).Within(1e-9)); // in case of ties, the earlier convention in the list should be returned var unknownQuoteTabulator = new QuotationMarkTabulator(); From 46be1c04516a1bba3c4da70156b512ee02a70c33 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 19 Nov 2025 10:50:52 -0500 Subject: [PATCH 2/6] Use TryGetValue --- src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs index e77a37e0..981a729e 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -80,11 +80,11 @@ public void TabulateFrom(QuotationMarkTabulator tabulatedQuotationMarks) ) { ((int depth, QuotationMarkDirection direction), QuotationMarkCounts counts) = (kvp.Key, kvp.Value); - if (!_quotationCountsByDepthAndDirection.ContainsKey((depth, direction))) + if (!_quotationCountsByDepthAndDirection.TryGetValue((depth, direction), out QuotationMarkCounts count)) { _quotationCountsByDepthAndDirection[(depth, direction)] = new QuotationMarkCounts(); } - _quotationCountsByDepthAndDirection[(depth, direction)].CountFrom(counts); + counts.CountFrom(counts); } } From 55ac62faa3dfeb64ed0bb4528251d2014134fd18 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 19 Nov 2025 11:21:11 -0500 Subject: [PATCH 3/6] Fix 'count' confusion --- .../QuotationMarkTabulator.cs | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs index 981a729e..70a6cc15 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -73,18 +73,22 @@ public void Tabulate(List quotationMarks) public void TabulateFrom(QuotationMarkTabulator tabulatedQuotationMarks) { foreach ( - KeyValuePair< - (int, QuotationMarkDirection), - QuotationMarkCounts - > kvp in tabulatedQuotationMarks._quotationCountsByDepthAndDirection + ( + (int depth, QuotationMarkDirection direction), + QuotationMarkCounts otherCounts + ) in tabulatedQuotationMarks._quotationCountsByDepthAndDirection.Select(kvp => (kvp.Key, kvp.Value)) ) { - ((int depth, QuotationMarkDirection direction), QuotationMarkCounts counts) = (kvp.Key, kvp.Value); - if (!_quotationCountsByDepthAndDirection.TryGetValue((depth, direction), out QuotationMarkCounts count)) + if ( + !_quotationCountsByDepthAndDirection.TryGetValue( + (depth, direction), + out QuotationMarkCounts currentCounts + ) + ) { - _quotationCountsByDepthAndDirection[(depth, direction)] = new QuotationMarkCounts(); + currentCounts = new QuotationMarkCounts(); } - counts.CountFrom(counts); + currentCounts.CountFrom(otherCounts); } } From fe52e102bb01e72dddf46f0e3b3c9b358cd47913 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 19 Nov 2025 11:21:42 -0500 Subject: [PATCH 4/6] Use 'thisCounts' --- .../PunctuationAnalysis/QuotationMarkTabulator.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs index 70a6cc15..2a2f084c 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -82,13 +82,13 @@ QuotationMarkCounts otherCounts if ( !_quotationCountsByDepthAndDirection.TryGetValue( (depth, direction), - out QuotationMarkCounts currentCounts + out QuotationMarkCounts thisCounts ) ) { - currentCounts = new QuotationMarkCounts(); + thisCounts = new QuotationMarkCounts(); } - currentCounts.CountFrom(otherCounts); + thisCounts.CountFrom(otherCounts); } } From f3b7feaabec7337d476dc584bed9ca71d08df654 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 19 Nov 2025 11:26:46 -0500 Subject: [PATCH 5/6] Initialize counts when key not found --- src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs index 2a2f084c..aed8a6ae 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -86,7 +86,7 @@ out QuotationMarkCounts thisCounts ) ) { - thisCounts = new QuotationMarkCounts(); + _quotationCountsByDepthAndDirection[(depth, direction)] = new QuotationMarkCounts(); } thisCounts.CountFrom(otherCounts); } From 4e07d0f1e11af466c2a899ef060725e56de1f363 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 19 Nov 2025 11:33:21 -0500 Subject: [PATCH 6/6] Assign thisCounts when key not found --- src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs index aed8a6ae..04054f8e 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs @@ -86,7 +86,8 @@ out QuotationMarkCounts thisCounts ) ) { - _quotationCountsByDepthAndDirection[(depth, direction)] = new QuotationMarkCounts(); + thisCounts = new QuotationMarkCounts(); + _quotationCountsByDepthAndDirection[(depth, direction)] = thisCounts; } thisCounts.CountFrom(otherCounts); }