diff --git a/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs index 5cbdc52a..93fbe81c 100644 --- a/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs @@ -1,7 +1,10 @@ using System; +using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text; using SIL.Machine.Corpora; +using SIL.Scripture; namespace SIL.Machine.PunctuationAnalysis { @@ -20,10 +23,38 @@ protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBa } public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null) + { + Dictionary<int, List<int>> includeChapters = null; + return GetQuoteConventionAnalysis(handler, includeChapters); + } + + /* Book-ID-keyed overload: converts each book ID key to its book number via Canon.BookIdToNumber and delegates to the book-number-keyed overload. */ public QuoteConventionAnalysis GetQuoteConventionAnalysis( + QuoteConventionDetector handler = null, + IReadOnlyDictionary<string, List<int>> includeChapters = null + ) + { + return GetQuoteConventionAnalysis( + handler, + includeChapters?.ToDictionary(kvp => Canon.BookIdToNumber(kvp.Key), kvp => kvp.Value) /* null (the declared default) is forwarded as null instead of throwing ArgumentNullException from ToDictionary */ + ); + } + + public QuoteConventionAnalysis GetQuoteConventionAnalysis( + QuoteConventionDetector handler = null, + IReadOnlyDictionary<int, List<int>> includeChapters = null + ) { handler = handler ?? 
new QuoteConventionDetector(); - foreach (string fileName in _settings.GetAllScriptureBookFileNames()) + foreach ( + string bookId in Canon + .AllBookNumbers.Where(num => Canon.IsCanonical(num)) + .Select(num => Canon.BookNumberToId(num)) + ) { + if (includeChapters != null && !includeChapters.ContainsKey(Canon.BookIdToNumber(bookId))) + continue; + + string fileName = _settings.GetBookFileName(bookId); if (!Exists(fileName)) continue; @@ -47,7 +78,7 @@ public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetecto throw new InvalidOperationException(sb.ToString(), ex); } } - return handler.DetectQuotationConvention(); + return handler.DetectQuoteConvention(includeChapters); } protected abstract bool Exists(string fileName); diff --git a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs index bd6c7fea..3200e2ab 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs @@ -56,9 +56,9 @@ private void CountQuotationMarksInChapter(Chapter chapter, QuoteConventionSet po _quotationMarkTabulator.Tabulate(resolvedQuotationMarks); } - public QuoteConventionAnalysis DetectQuotationConvention() + public QuoteConventionAnalysis DetectQuoteConvention(IReadOnlyDictionary> includeChapters = null) { - CountQuotationMarksInChapters(GetChapters()); + CountQuotationMarksInChapters(GetChapters(includeChapters)); (QuoteConvention bestQuoteConvention, double score) = QuoteConventions.Standard.FindMostSimilarConvention( _quotationMarkTabulator diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index 13ef13ec..e4a6d06f 100644 --- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -12,6 +12,8 @@ public string Text get => _surrogatePairString.ToString(); private set => 
_surrogatePairString = new SurrogatePairString(value); } + public string Book { get; private set; } + public int Chapter { get; private set; } public UsfmMarkerType ImmediatePrecedingMarker { get; private set; } public HashSet MarkersInPrecedingContext { get; private set; } public TextSegment PreviousSegment { get; set; } @@ -139,6 +141,18 @@ public Builder AddPrecedingMarker(UsfmMarkerType marker) return this; } + public Builder SetBook(string code) + { + _textSegment.Book = code; + return this; + } + + public Builder SetChapter(int number) + { + _textSegment.Chapter = number; + return this; + } + public Builder SetUsfmToken(UsfmToken token) { _textSegment.UsfmToken = token; diff --git a/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs b/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs index ce2d6cd7..cf8cf27f 100644 --- a/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs +++ b/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs @@ -1,5 +1,6 @@ using System.Collections.Generic; using SIL.Machine.Corpora; +using SIL.Scripture; namespace SIL.Machine.PunctuationAnalysis { @@ -14,9 +15,15 @@ public UsfmStructureExtractor() _nextTextSegmentBuilder = new TextSegment.Builder(); } + public void StartBook(UsfmParserState state, string marker, string code) + { + _nextTextSegmentBuilder.SetBook(code); + } + public void Chapter(UsfmParserState state, string number, string marker, string altNumber, string pubNumber) { _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter); + _nextTextSegmentBuilder.SetChapter(state.VerseRef.ChapterNum); } public void EndBook(UsfmParserState state, string marker) { } @@ -65,8 +72,6 @@ public void Ref(UsfmParserState state, string marker, string display, string tar _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed); } - public void StartBook(UsfmParserState state, string marker, string code) { } - public void StartCell(UsfmParserState state, string marker, string align, 
int colspan) { } public void StartChar( @@ -127,13 +132,26 @@ public void Verse(UsfmParserState state, string number, string marker, string al _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse); } - public List GetChapters() + public List GetChapters(IReadOnlyDictionary> includeChapters = null) { var chapters = new List(); + int currentBook = 0; + int currentChapter = 0; var currentChapterVerses = new List(); var currentVerseSegments = new List(); foreach (TextSegment textSegment in _textSegments) { + if (textSegment.Book != null) + currentBook = Canon.BookIdToNumber(textSegment.Book); + if (textSegment.Chapter > 0) + currentChapter = textSegment.Chapter; + if (includeChapters != null && currentBook > 0) + { + if (!includeChapters.TryGetValue(currentBook, out List bookChapters)) + continue; + if (currentChapter > 0 && bookChapters.Count > 0 && !bookChapters.Contains(currentChapter)) + continue; + } if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)) { if (currentVerseSegments.Count > 0) diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs index 46a75faa..7338d611 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs @@ -8,6 +8,11 @@ namespace SIL.Machine.Corpora; [TestFixture] public class ParatextProjectQuoteConventionDetectorTests { + private static readonly QuoteConvention StandardEnglishQuoteConvention = + QuoteConventions.Standard.GetQuoteConventionByName("standard_english"); + private static readonly QuoteConvention StandardFrenchQuoteConvention = + QuoteConventions.Standard.GetQuoteConventionByName("standard_french"); + [Test] public void TestGetQuotationAnalysis() { @@ -16,18 +21,9 @@ public void TestGetQuotationAnalysis() { { "41MATTest.SFM", - @"\id MAT -\c 1 -\v 1 Someone said, “This is 
something I am saying! -\v 2 This is also something I am saying” (that is, “something I am speaking”). -\p -\v 3 Other text, and someone else said, -\q1 -\v 4 “Things -\q2 someone else said! -\q3 and more things someone else said.” -\m That is why he said “things someone else said.” -\v 5 Then someone said, “More things someone said.”" + $@"\id MAT +{GetTestChapter(1, StandardEnglishQuoteConvention)} +" } } ); @@ -37,6 +33,100 @@ public void TestGetQuotationAnalysis() Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english")); } + [Test] + public void TestGetQuotationByBook() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "41MATTest.SFM", + $@"\id MAT +{GetTestChapter(1, StandardEnglishQuoteConvention)} +" + }, + { + "42MRKTest.SFM", + $@"\id MRK +{GetTestChapter(1, StandardFrenchQuoteConvention)} +" + } + } + ); + QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK"); + Assert.That(analysis, Is.Not.Null); + Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8)); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french")); + } + + [Test] + public void TestGetQuotationConventionByChapter() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "41MATTest.SFM", + $@"\id MAT +{GetTestChapter(1, StandardEnglishQuoteConvention)} +" + }, + { + "42MRKTest.SFM", + $@"\id MRK +{GetTestChapter(1, StandardEnglishQuoteConvention)} +{GetTestChapter(2, StandardFrenchQuoteConvention)} +{GetTestChapter(3, StandardEnglishQuoteConvention)} +{GetTestChapter(4, StandardEnglishQuoteConvention)} +{GetTestChapter(5, StandardFrenchQuoteConvention)} +" + } + } + ); + QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK2,4-5"); + Assert.That(analysis, Is.Not.Null); + Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.66)); + Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french")); + } + + [Test] + public void 
TestGetQuotationConventionByChapterIndeterminate() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "41MATTest.SFM", + $@"\id MAT +{GetTestChapter(1)} +{GetTestChapter(2, StandardEnglishQuoteConvention)} +{GetTestChapter(3)} +" + } + } + ); + QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT1,3"); + Assert.That(analysis, Is.Null); + } + + [Test] + public void TestGetQuotationConventionInvalidBookCode() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "41MATTest.SFM", + $@"\id LUK +{GetTestChapter(1, StandardEnglishQuoteConvention)} +" + } + } + ); + QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT"); + Assert.That(analysis, Is.Null); + } + private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary? files = null) { public ParatextProjectQuoteConventionDetector Detector { get; } = @@ -45,12 +135,37 @@ private class TestEnvironment(ParatextProjectSettings? settings = null, Dictiona files ?? new() ); - public QuoteConventionAnalysis GetQuoteConvention() + public QuoteConventionAnalysis GetQuoteConvention(string? scriptureRange = null) { - return Detector.GetQuoteConventionAnalysis(); + Dictionary>? chapters = null; + if (scriptureRange != null) + { + chapters = ScriptureRangeParser + .GetChapters(scriptureRange) + .ToDictionary(kvp => Canon.BookIdToNumber(kvp.Key), kvp => kvp.Value); + } + return Detector.GetQuoteConventionAnalysis(includeChapters: chapters); } } + private static string GetTestChapter(int number, QuoteConvention? quoteConvention = null) + { + string leftQuote = quoteConvention != null ? quoteConvention.GetOpeningQuotationMarkAtDepth(1) : ""; + string rightQuote = quoteConvention != null ? quoteConvention.GetClosingQuotationMarkAtDepth(1) : ""; + return $@"\c {number} +\v 1 Someone said, {leftQuote}This is something I am saying! +\v 2 This is also something I am saying{rightQuote} (that is, {leftQuote}something I am speaking{rightQuote}). 
+\p +\v 3 Other text, and someone else said, +\q1 +\v 4 {leftQuote}Things +\q2 someone else said! +\q3 and more things someone else said.{rightQuote} +\m That is why he said {leftQuote}things someone else said.{rightQuote} +\v 5 Then someone said, {leftQuote}More things someone said.{rightQuote} + "; + } + private class DefaultParatextProjectSettings( string name = "Test", string fullName = "TestProject", diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index d5ddf36d..e02ba432 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -186,8 +186,8 @@ public void AnalyzeCorporaQuoteConventions() var quoteConventionDetector2 = new ZipParatextProjectQuoteConventionDetector(zipArchive2); quoteConventionDetector2.GetQuoteConventionAnalysis(targetHandler); - QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuotationConvention(); - QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuotationConvention(); + QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuoteConvention(); + QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuoteConvention(); Assert.Multiple(() => { diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs index 8b34a377..212daca8 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs @@ -368,6 +368,6 @@ public QuoteConventionAnalysis DetectQuotationConvention(string usfm) { var quoteConventionDetector = new QuoteConventionDetector(); UsfmParser.Parse(usfm, quoteConventionDetector); - return quoteConventionDetector.DetectQuotationConvention(); + return quoteConventionDetector.DetectQuoteConvention(); } } diff --git 
a/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs index 6615ec92..77871027 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs @@ -16,6 +16,63 @@ public void SetUp() _verseTextParserState.SetVerseNum(1); } + [Test] + public void GetChaptersFilterByBook() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.StartBook(_verseTextParserState, "id", "GEN"); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + + Assert.That( + usfmStructureExtractor.GetChapters(new Dictionary> { { 2, [1] } }), // EXO 1 + Has.Count.EqualTo(0) + ); + } + + [Test] + public void GetChaptersFilterByChapter() + { + var usfmStructureExtractor = new UsfmStructureExtractor(); + usfmStructureExtractor.StartBook(_verseTextParserState, "id", "MAT"); + _verseTextParserState.SetChapterNum(1); + usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test"); + _verseTextParserState.SetChapterNum(2); + usfmStructureExtractor.Chapter(_verseTextParserState, "2", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test2"); + _verseTextParserState.SetChapterNum(3); + usfmStructureExtractor.Chapter(_verseTextParserState, "3", "c", null, null); + usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); + usfmStructureExtractor.Text(_verseTextParserState, "test3"); + + List expectedChapters = + [ + new Chapter( + [ + new Verse( + [ 
+ new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build() + ] + ) + ] + ) + ]; + List actualChapters = usfmStructureExtractor.GetChapters( + new Dictionary> { { 40, [2] } } + ); + AssertChapterEqual(expectedChapters, actualChapters); + Assert.That(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment, Is.Null); + Assert.That(actualChapters[0].Verses[0].TextSegments[0].NextSegment, Is.Null); + } + [Test] public void ChapterAndVerseMarkers() { @@ -493,5 +550,12 @@ public void SetVerseNum(int verseNum) vref.VerseNum = verseNum; VerseRef = vref; } + + public void SetChapterNum(int chapterNum) + { + VerseRef vref = VerseRef; + vref.ChapterNum = chapterNum; + VerseRef = vref; + } } }