Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using SIL.Machine.Corpora;
using SIL.Scripture;

namespace SIL.Machine.PunctuationAnalysis
{
Expand All @@ -20,10 +23,38 @@ protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBa
}

/// <summary>
/// Runs the quote convention analysis without any book/chapter filter.
/// </summary>
/// <param name="handler">Detector forwarded unchanged to the two-argument overload; may be null.</param>
/// <returns>The result of the two-argument overload called with a null chapter filter.</returns>
public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null)
{
    // The typed null selects the overload keyed by book number; an untyped null literal would
    // match both the string-keyed and the int-keyed overloads.
    return GetQuoteConventionAnalysis(handler, (IReadOnlyDictionary<int, List<int>>)null);
}

/// <summary>
/// Runs the quote convention analysis with a chapter filter keyed by book ID.
/// </summary>
/// <param name="handler">Detector forwarded unchanged to the overload keyed by book number; may be null.</param>
/// <param name="includeChapters">
/// Chapter lists keyed by book ID. Each key is converted with <c>Canon.BookIdToNumber</c>.
/// When null (the declared default), a null filter is forwarded.
/// </param>
/// <returns>The result of the overload keyed by book number.</returns>
public QuoteConventionAnalysis GetQuoteConventionAnalysis(
    QuoteConventionDetector handler = null,
    IReadOnlyDictionary<string, List<int>> includeChapters = null
)
{
    // Enumerable.ToDictionary throws ArgumentNullException for a null source, so the default
    // value of includeChapters must be guarded before converting the keys.
    IReadOnlyDictionary<int, List<int>> chaptersByBookNumber = includeChapters?.ToDictionary(
        kvp => Canon.BookIdToNumber(kvp.Key),
        kvp => kvp.Value
    );
    return GetQuoteConventionAnalysis(handler, chaptersByBookNumber);
}

public QuoteConventionAnalysis GetQuoteConventionAnalysis(
QuoteConventionDetector handler = null,
IReadOnlyDictionary<int, List<int>> includeChapters = null
)
{
handler = handler ?? new QuoteConventionDetector();
foreach (string fileName in _settings.GetAllScriptureBookFileNames())
foreach (
string bookId in Canon
.AllBookNumbers.Where(num => Canon.IsCanonical(num))
.Select(num => Canon.BookNumberToId(num))
)
{
if (includeChapters != null && !includeChapters.ContainsKey(Canon.BookIdToNumber(bookId)))
continue;

string fileName = _settings.GetBookFileName(bookId);
if (!Exists(fileName))
continue;

Expand All @@ -47,7 +78,7 @@ public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetecto
throw new InvalidOperationException(sb.ToString(), ex);
}
}
return handler.DetectQuotationConvention();
return handler.DetectQuoteConvention(includeChapters);
}

protected abstract bool Exists(string fileName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ private void CountQuotationMarksInChapter(Chapter chapter, QuoteConventionSet po
_quotationMarkTabulator.Tabulate(resolvedQuotationMarks);
}

public QuoteConventionAnalysis DetectQuotationConvention()
public QuoteConventionAnalysis DetectQuoteConvention(IReadOnlyDictionary<int, List<int>> includeChapters = null)
{
CountQuotationMarksInChapters(GetChapters());
CountQuotationMarksInChapters(GetChapters(includeChapters));

(QuoteConvention bestQuoteConvention, double score) = QuoteConventions.Standard.FindMostSimilarConvention(
_quotationMarkTabulator
Expand Down
14 changes: 14 additions & 0 deletions src/SIL.Machine/PunctuationAnalysis/TextSegment.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ public string Text
get => _surrogatePairString.ToString();
private set => _surrogatePairString = new SurrogatePairString(value);
}
/// <summary>
/// Book code assigned through <c>Builder.SetBook</c>.
/// NOTE(review): presumably null when the builder never set it - confirm against the constructor.
/// </summary>
public string Book { get; private set; }
/// <summary>
/// Chapter number assigned through <c>Builder.SetChapter</c>.
/// NOTE(review): presumably 0 when the builder never set it - confirm against the constructor.
/// </summary>
public int Chapter { get; private set; }
public UsfmMarkerType ImmediatePrecedingMarker { get; private set; }
public HashSet<UsfmMarkerType> MarkersInPrecedingContext { get; private set; }
public TextSegment PreviousSegment { get; set; }
Expand Down Expand Up @@ -139,6 +141,18 @@ public Builder AddPrecedingMarker(UsfmMarkerType marker)
return this;
}

/// <summary>
/// Stores the given book code on the text segment being built.
/// </summary>
/// <param name="code">Book code to assign to the segment's <c>Book</c> property.</param>
/// <returns>This builder, so calls can be chained.</returns>
public Builder SetBook(string code)
{
_textSegment.Book = code;
return this;
}

/// <summary>
/// Stores the given chapter number on the text segment being built.
/// </summary>
/// <param name="number">Chapter number to assign to the segment's <c>Chapter</c> property.</param>
/// <returns>This builder, so calls can be chained.</returns>
public Builder SetChapter(int number)
{
_textSegment.Chapter = number;
return this;
}

public Builder SetUsfmToken(UsfmToken token)
{
_textSegment.UsfmToken = token;
Expand Down
24 changes: 21 additions & 3 deletions src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System.Collections.Generic;
using SIL.Machine.Corpora;
using SIL.Scripture;

namespace SIL.Machine.PunctuationAnalysis
{
Expand All @@ -14,9 +15,15 @@ public UsfmStructureExtractor()
_nextTextSegmentBuilder = new TextSegment.Builder();
}

/// <summary>
/// Records the book code on the builder for the next text segment.
/// </summary>
/// <param name="state">Parser state; not used here.</param>
/// <param name="marker">Marker that started the book; not used here.</param>
/// <param name="code">Book code passed on to the segment builder.</param>
public void StartBook(UsfmParserState state, string marker, string code)
{
_nextTextSegmentBuilder.SetBook(code);
}

/// <summary>
/// Marks the next text segment as preceded by a chapter marker and records its chapter number.
/// </summary>
/// <param name="state">Parser state; its <c>VerseRef.ChapterNum</c> supplies the chapter number.</param>
/// <param name="number">Chapter number text from the marker; not used here.</param>
/// <param name="marker">The chapter marker; not used here.</param>
/// <param name="altNumber">Alternate chapter number; not used here.</param>
/// <param name="pubNumber">Published chapter number; not used here.</param>
public void Chapter(UsfmParserState state, string number, string marker, string altNumber, string pubNumber)
{
_nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter);
// The numeric chapter comes from the parser state rather than from the raw `number` string.
_nextTextSegmentBuilder.SetChapter(state.VerseRef.ChapterNum);
}

// End-of-book callback; the body is empty, so nothing is recorded when a book ends.
public void EndBook(UsfmParserState state, string marker) { }
Expand Down Expand Up @@ -65,8 +72,6 @@ public void Ref(UsfmParserState state, string marker, string display, string tar
_nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed);
}

public void StartBook(UsfmParserState state, string marker, string code) { }

// Start-of-table-cell callback; the body is empty, so cell starts are not recorded.
public void StartCell(UsfmParserState state, string marker, string align, int colspan) { }

public void StartChar(
Expand Down Expand Up @@ -127,13 +132,26 @@ public void Verse(UsfmParserState state, string number, string marker, string al
_nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse);
}

public List<Chapter> GetChapters()
public List<Chapter> GetChapters(IReadOnlyDictionary<int, List<int>> includeChapters = null)
{
var chapters = new List<Chapter>();
int currentBook = 0;
int currentChapter = 0;
var currentChapterVerses = new List<Verse>();
var currentVerseSegments = new List<TextSegment>();
foreach (TextSegment textSegment in _textSegments)
{
if (textSegment.Book != null)
currentBook = Canon.BookIdToNumber(textSegment.Book);
if (textSegment.Chapter > 0)
currentChapter = textSegment.Chapter;
if (includeChapters != null && currentBook > 0)
{
if (!includeChapters.TryGetValue(currentBook, out List<int> bookChapters))
continue;
if (currentChapter > 0 && bookChapters.Count > 0 && !bookChapters.Contains(currentChapter))
continue;
}
if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse))
{
if (currentVerseSegments.Count > 0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ namespace SIL.Machine.Corpora;
[TestFixture]
public class ParatextProjectQuoteConventionDetectorTests
{
// Conventions looked up by name from the standard set; the tests pass them to GetTestChapter
// to generate chapter text quoted in the corresponding style.
private static readonly QuoteConvention StandardEnglishQuoteConvention =
QuoteConventions.Standard.GetQuoteConventionByName("standard_english");
private static readonly QuoteConvention StandardFrenchQuoteConvention =
QuoteConventions.Standard.GetQuoteConventionByName("standard_french");

[Test]
public void TestGetQuotationAnalysis()
{
Expand All @@ -16,18 +21,9 @@ public void TestGetQuotationAnalysis()
{
{
"41MATTest.SFM",
@"\id MAT
\c 1
\v 1 Someone said, “This is something I am saying!
\v 2 This is also something I am saying” (that is, “something I am speaking”).
\p
\v 3 Other text, and someone else said,
\q1
\v 4 “Things
\q2 someone else said!
\q3 and more things someone else said.”
\m That is why he said “things someone else said.”
\v 5 Then someone said, “More things someone said.”"
$@"\id MAT
{GetTestChapter(1, StandardEnglishQuoteConvention)}
"
}
}
);
Expand All @@ -37,6 +33,100 @@ public void TestGetQuotationAnalysis()
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english"));
}

// Restricting the analysis to one book: MAT is written with the English convention and MRK with
// the French one, so requesting only MRK is expected to detect standard_french.
[Test]
public void TestGetQuotationByBook()
{
var env = new TestEnvironment(
files: new Dictionary<string, string>()
{
{
"41MATTest.SFM",
$@"\id MAT
{GetTestChapter(1, StandardEnglishQuoteConvention)}
"
},
{
"42MRKTest.SFM",
$@"\id MRK
{GetTestChapter(1, StandardFrenchQuoteConvention)}
"
}
}
);
// The range names the book only, with no chapter list.
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK");
Assert.That(analysis, Is.Not.Null);
Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8));
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french"));
}

// Restricting the analysis to specific chapters of one book. In MRK, chapters 2 and 5 use the
// French convention while chapters 1, 3 and 4 use the English one.
[Test]
public void TestGetQuotationConventionByChapter()
{
var env = new TestEnvironment(
files: new Dictionary<string, string>()
{
{
"41MATTest.SFM",
$@"\id MAT
{GetTestChapter(1, StandardEnglishQuoteConvention)}
"
},
{
"42MRKTest.SFM",
$@"\id MRK
{GetTestChapter(1, StandardEnglishQuoteConvention)}
{GetTestChapter(2, StandardFrenchQuoteConvention)}
{GetTestChapter(3, StandardEnglishQuoteConvention)}
{GetTestChapter(4, StandardEnglishQuoteConvention)}
{GetTestChapter(5, StandardFrenchQuoteConvention)}
"
}
}
);
// Presumably the range selects MRK chapters 2, 4 and 5 (two French, one English), which would
// explain the 0.66 threshold below - confirm against ScriptureRangeParser.
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK2,4-5");
Assert.That(analysis, Is.Not.Null);
Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.66));
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french"));
}

// Chapters 1 and 3 are generated without a convention (no quotation marks); only chapter 2 has
// English marks. Requesting chapters 1 and 3 is expected to produce no analysis (null).
[Test]
public void TestGetQuotationConventionByChapterIndeterminate()
{
var env = new TestEnvironment(
files: new Dictionary<string, string>()
{
{
"41MATTest.SFM",
$@"\id MAT
{GetTestChapter(1)}
{GetTestChapter(2, StandardEnglishQuoteConvention)}
{GetTestChapter(3)}
"
}
}
);
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT1,3");
Assert.That(analysis, Is.Null);
}

// The file is named for MAT but its \id line declares LUK. Requesting MAT is expected to
// produce no analysis (null).
[Test]
public void TestGetQuotationConventionInvalidBookCode()
{
var env = new TestEnvironment(
files: new Dictionary<string, string>()
{
{
"41MATTest.SFM",
$@"\id LUK
{GetTestChapter(1, StandardEnglishQuoteConvention)}
"
}
}
);
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT");
Assert.That(analysis, Is.Null);
}

private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary<string, string>? files = null)
{
public ParatextProjectQuoteConventionDetector Detector { get; } =
Expand All @@ -45,12 +135,37 @@ private class TestEnvironment(ParatextProjectSettings? settings = null, Dictiona
files ?? new()
);

public QuoteConventionAnalysis GetQuoteConvention()
public QuoteConventionAnalysis GetQuoteConvention(string? scriptureRange = null)
{
return Detector.GetQuoteConventionAnalysis();
Dictionary<int, List<int>>? chapters = null;
if (scriptureRange != null)
{
chapters = ScriptureRangeParser
.GetChapters(scriptureRange)
.ToDictionary(kvp => Canon.BookIdToNumber(kvp.Key), kvp => kvp.Value);
}
return Detector.GetQuoteConventionAnalysis(includeChapters: chapters);
}
}

// Produces the USFM text of one test chapter with the given number. Depth-1 opening and closing
// marks are taken from quoteConvention; when it is null the text contains no quotation marks.
private static string GetTestChapter(int number, QuoteConvention? quoteConvention = null)
{
    string open = "";
    string close = "";
    if (quoteConvention != null)
    {
        open = quoteConvention.GetOpeningQuotationMarkAtDepth(1);
        close = quoteConvention.GetClosingQuotationMarkAtDepth(1);
    }

    return $@"\c {number}
\v 1 Someone said, {open}This is something I am saying!
\v 2 This is also something I am saying{close} (that is, {open}something I am speaking{close}).
\p
\v 3 Other text, and someone else said,
\q1
\v 4 {open}Things
\q2 someone else said!
\q3 and more things someone else said.{close}
\m That is why he said {open}things someone else said.{close}
\v 5 Then someone said, {open}More things someone said.{close}
";
}

private class DefaultParatextProjectSettings(
string name = "Test",
string fullName = "TestProject",
Expand Down
4 changes: 2 additions & 2 deletions tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,8 @@ public void AnalyzeCorporaQuoteConventions()
var quoteConventionDetector2 = new ZipParatextProjectQuoteConventionDetector(zipArchive2);
quoteConventionDetector2.GetQuoteConventionAnalysis(targetHandler);

QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuotationConvention();
QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuotationConvention();
QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuoteConvention();
QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuoteConvention();

Assert.Multiple(() =>
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,6 @@ public QuoteConventionAnalysis DetectQuotationConvention(string usfm)
{
var quoteConventionDetector = new QuoteConventionDetector();
UsfmParser.Parse(usfm, quoteConventionDetector);
return quoteConventionDetector.DetectQuotationConvention();
return quoteConventionDetector.DetectQuoteConvention();
}
}
Loading
Loading