diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs index d5602ed3..2c56e871 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs @@ -1,4 +1,5 @@ using System.Collections.Generic; +using System.Globalization; using System.Linq; using PCRE; @@ -43,11 +44,21 @@ public List FindAllPotentialQuotationMarksInTextSegmen _quoteConventions.IsValidOpeningQuotationMark(match.Groups[0].Value) || _quoteConventions.IsValidClosingQuotationMark(match.Groups[0].Value) ) - .Select(m => new QuotationMarkStringMatch( - textSegment, - m.Groups[0].Index, - m.Groups[0].Index + m.Groups[0].Length - )) + .Select(m => + { + int[] textElementIndices = StringInfo.ParseCombiningCharacters(textSegment.Text); + int startIndex = 0; + int endIndex = textElementIndices.Length; + for (int textElementIndex = 0; textElementIndex < textElementIndices.Length; textElementIndex++) + { + int stringIndex = textElementIndices[textElementIndex]; + if (stringIndex == m.Groups[0].Index) + startIndex = textElementIndex; + if (stringIndex == m.Groups[0].EndIndex) + endIndex = textElementIndex; + } + return new QuotationMarkStringMatch(textSegment, startIndex, endIndex); + }) .ToList(); } } diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index f2f783fb..6b20438c 100644 --- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Globalization; using SIL.Machine.Corpora; namespace SIL.Machine.PunctuationAnalysis @@ -70,7 +71,7 @@ public override int GetHashCode() return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode(); } - public int Length => Text.Length; + public int Length => StringInfo.ParseCombiningCharacters(Text).Length; public string SubstringBefore(int index) { diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs b/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs index b395fef7..2677683b 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs @@ -16,8 +16,10 @@ internal static class CorporaTestHelpers ); public static readonly string UsfmTestProjectPath = Path.Combine(TestDataPath, "usfm", "Tes"); public static readonly string UsfmTargetProjectPath = Path.Combine(TestDataPath, "usfm", "target"); + public static readonly string UsfmTargetProjectZipPath = Path.Combine(TestDataPath, "project", "target"); public static readonly string UsfmTargetCustomVrsPath = Path.Combine(TestDataPath, "usfm", "target", "custom.vrs"); public static readonly string UsfmSourceProjectPath = Path.Combine(TestDataPath, "usfm", "source"); + public static readonly string UsfmSourceProjectZipPath = Path.Combine(TestDataPath, "project", "source"); public static readonly string UsxTestProjectPath = Path.Combine(TestDataPath, "usx", "Tes"); public static readonly string TextTestProjectPath = Path.Combine(TestDataPath, "txt"); public static readonly string DeuterocanonicalsSourcePath = Path.Combine( diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 30d8ecf7..d5ddf36d 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -1,6 +1,7 @@ using System.IO.Compression; using System.Text.Json; using NUnit.Framework; +using SIL.Machine.PunctuationAnalysis; namespace SIL.Machine.Corpora; @@ -170,4 +171,28 @@ async Task GetUsfmAsync(string projectPath) await GetUsfmAsync(ParatextProjectPath); } } + + [Test] + [Ignore("This is for manual testing only. Remove this tag to run the test.")] + public void AnalyzeCorporaQuoteConventions() + { + var sourceHandler = new QuoteConventionDetector(); + using ZipArchive zipArchive = ZipFile.OpenRead(CorporaTestHelpers.UsfmSourceProjectZipPath); + var quoteConventionDetector = new ZipParatextProjectQuoteConventionDetector(zipArchive); + quoteConventionDetector.GetQuoteConventionAnalysis(sourceHandler); + + var targetHandler = new QuoteConventionDetector(); + using ZipArchive zipArchive2 = ZipFile.OpenRead(CorporaTestHelpers.UsfmTargetProjectZipPath); + var quoteConventionDetector2 = new ZipParatextProjectQuoteConventionDetector(zipArchive2); + quoteConventionDetector2.GetQuoteConventionAnalysis(targetHandler); + + QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuotationConvention(); + QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuotationConvention(); + + Assert.Multiple(() => + { + Assert.NotNull(sourceAnalysis); + Assert.NotNull(targetAnalysis); + }); + } } diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs index 0c70b746..44608d7e 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -282,6 +282,22 @@ public void ThatAllPossibleQuotationMarksAreIdentified() ] ) ); + + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("उत्पत्ति \"पुस्तकले").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("उत्पत्ति \"पुस्तकले").Build(), + 6, + 7 + ), + ] + ) + ); } [Test] diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs index 2870d81e..f62b750a 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs @@ -188,6 +188,8 @@ public void Length() textSegment = new TextSegment.Builder().SetText("new example text").Build(); Assert.That(textSegment.Length, Is.EqualTo("new example text".Length)); + textSegment = new TextSegment.Builder().SetText("उत्पत्ति पुस्तकले").Build(); + Assert.That(textSegment.Length, Is.EqualTo(11)); } [Test]