Index quotation-mark text by code point (surrogate pairs count once) instead of by text element.

Corrections folded into the SurrogatePairString hunk of TextSegment.cs:
  * the constructor yielded (string index, surrogate pair index) into a tuple declared as
    (SurrogatePairIndex, StringIndex), so both lookup tables were filled the wrong way round;
  * GetStringIndexForSurrogatePairIndex read the string-index-keyed table;
  * the indexer accepted index == Length and could read one char past the end of the string.
The blob id after ".." on that file's index line was computed before these corrections.

diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs
index 2c56e871..d045b266 100644
--- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs
+++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs
@@ -1,5 +1,4 @@
 using System.Collections.Generic;
-using System.Globalization;
 using System.Linq;
 using PCRE;
 
@@ -46,17 +45,9 @@ public List<QuotationMarkStringMatch> FindAllPotentialQuotationMarksInTextSegmen
                 )
                 .Select(m =>
                 {
-                    int[] textElementIndices = StringInfo.ParseCombiningCharacters(textSegment.Text);
-                    int startIndex = 0;
-                    int endIndex = textElementIndices.Length;
-                    for (int textElementIndex = 0; textElementIndex < textElementIndices.Length; textElementIndex++)
-                    {
-                        int stringIndex = textElementIndices[textElementIndex];
-                        if (stringIndex == m.Groups[0].Index)
-                            startIndex = textElementIndex;
-                        if (stringIndex == m.Groups[0].EndIndex)
-                            endIndex = textElementIndex;
-                    }
+                    SurrogatePairString surrogatePairString = new SurrogatePairString(textSegment.Text);
+                    int startIndex = surrogatePairString.GetSurrogatePairIndexForStringIndex(m.Groups[0].Index);
+                    int endIndex = surrogatePairString.GetSurrogatePairIndexForStringIndex(m.Groups[0].EndIndex);
                     return new QuotationMarkStringMatch(textSegment, startIndex, endIndex);
                 })
                 .ToList();
diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs
index 0e322ca6..6cee1924 100644
--- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs
+++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs
@@ -1,5 +1,4 @@
 using System;
-using System.Globalization;
 using System.Text.RegularExpressions;
 using PCRE;
 
@@ -42,8 +41,7 @@ public override int GetHashCode()
             return code;
         }
 
-        public string QuotationMark =>
-            new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex, EndIndex - StartIndex);
+        public string QuotationMark => TextSegment.Substring(StartIndex, EndIndex - StartIndex);
 
         public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventions) =>
             quoteConventions.IsValidOpeningQuotationMark(QuotationMark);
@@ -74,14 +72,11 @@ public string PreviousCharacter
                     TextSegment previousSegment = TextSegment.PreviousSegment;
                     if (previousSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph))
                     {
-                        return new StringInfo(previousSegment.Text).SubstringByTextElements(
-                            StringInfo.ParseCombiningCharacters(previousSegment.Text).Length - 1,
-                            1
-                        );
+                        return previousSegment.Substring(previousSegment.Length - 1, 1);
                     }
                     return null;
                 }
-                return new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex - 1, 1);
+                return TextSegment.Substring(StartIndex - 1, 1);
             }
         }
 
@@ -94,11 +89,11 @@ public string NextCharacter
                     TextSegment nextSegment = TextSegment.NextSegment;
                     if (nextSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph))
                     {
-                        return new StringInfo(nextSegment.Text).SubstringByTextElements(0, 1);
+                        return nextSegment.Substring(0, 1);
                     }
                     return null;
                 }
-                return new StringInfo(TextSegment.Text).SubstringByTextElements(EndIndex, 1);
+                return TextSegment.Substring(EndIndex, 1);
             }
         }
 
diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs
index 6b20438c..13ef13ec 100644
--- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs
+++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs
@@ -1,13 +1,17 @@
 using System;
 using System.Collections.Generic;
-using System.Globalization;
+using System.Linq;
 using SIL.Machine.Corpora;
 
 namespace SIL.Machine.PunctuationAnalysis
 {
     public class TextSegment : IEquatable<TextSegment>
     {
-        public string Text { get; private set; }
+        public string Text
+        {
+            get => _surrogatePairString.ToString();
+            private set => _surrogatePairString = new SurrogatePairString(value);
+        }
         public UsfmMarkerType ImmediatePrecedingMarker { get; private set; }
         public HashSet<UsfmMarkerType> MarkersInPrecedingContext { get; private set; }
         public TextSegment PreviousSegment { get; set; }
@@ -15,6 +19,7 @@ public class TextSegment : IEquatable<TextSegment>
         public int IndexInVerse { get; set; }
         public int NumSegmentsInVerse { get; set; }
         public UsfmToken UsfmToken { get; private set; }
+        private SurrogatePairString _surrogatePairString;
 
         public TextSegment()
         {
@@ -71,16 +76,21 @@ public override int GetHashCode()
             return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode();
         }
 
-        public int Length => StringInfo.ParseCombiningCharacters(Text).Length;
+        public int Length => _surrogatePairString.Length;
+
+        public string Substring(int startIndex, int length)
+        {
+            return _surrogatePairString.Substring(startIndex, length);
+        }
 
         public string SubstringBefore(int index)
         {
-            return Text.Substring(0, index);
+            return Substring(0, index);
         }
 
         public string SubstringAfter(int index)
         {
-            return Text.Substring(index);
+            return Substring(index, Length - index);
         }
 
         public bool MarkerIsInPrecedingContext(UsfmMarkerType marker)
@@ -147,4 +157,98 @@ public TextSegment Build()
             }
         }
     }
+
+    /// <summary>
+    /// Class to handle indexing of strings by unicode code point, treating surrogate pairs as single characters.
+    /// </summary>
+    public class SurrogatePairString
+    {
+        /// <summary>The underlying UTF-16 string.</summary>
+        public string String => _stringValue;
+        /// <summary>Number of UTF-16 chars that are not low surrogates, so a surrogate pair counts once.</summary>
+        public int Length => _stringIndexBySurrogatePairIndex.Count;
+
+        private readonly string _stringValue;
+        private readonly Dictionary<int, int> _surrogatePairIndexByStringIndex;
+        private readonly Dictionary<int, int> _stringIndexBySurrogatePairIndex;
+
+        public SurrogatePairString(string stringValue)
+        {
+            _stringValue = stringValue;
+            // Yield (surrogate pair index, string index), in the order the tuple element names promise.
+            IEnumerable<(int SurrogatePairIndex, int StringIndex)> indexPairs = _stringValue
+                .Select((c, i) => (c, i))
+                .Where(tup => !char.IsLowSurrogate(tup.c))
+                .Select((tup, i) => (i, tup.i));
+            _surrogatePairIndexByStringIndex = new Dictionary<int, int>();
+            _stringIndexBySurrogatePairIndex = new Dictionary<int, int>();
+            foreach ((int surrogatePairIndex, int stringIndex) in indexPairs)
+            {
+                _surrogatePairIndexByStringIndex[stringIndex] = surrogatePairIndex;
+                _stringIndexBySurrogatePairIndex[surrogatePairIndex] = stringIndex;
+            }
+        }
+
+        public override string ToString()
+        {
+            return _stringValue;
+        }
+
+        /// <summary>Gets the character at the given index; a surrogate pair is returned as a two-char string.</summary>
+        public string this[int surrogatePairIndex]
+        {
+            get
+            {
+                if (surrogatePairIndex < 0 || surrogatePairIndex >= Length)
+                {
+                    throw new IndexOutOfRangeException(
+                        $"Index {surrogatePairIndex} is out of bounds for SurrogatePairString with length {Length}."
+                    );
+                }
+                int stringIndex = _stringIndexBySurrogatePairIndex[surrogatePairIndex];
+                char characterAtStringIndex = _stringValue[stringIndex];
+                if (
+                    stringIndex + 1 < _stringValue.Length
+                    && char.IsSurrogatePair(characterAtStringIndex, _stringValue[stringIndex + 1])
+                )
+                {
+                    return _stringValue.Substring(stringIndex, 2);
+                }
+                return characterAtStringIndex.ToString();
+            }
+        }
+
+        /// <summary>Maps a UTF-16 index to a surrogate pair index (end of string maps to Length).</summary>
+        public int GetSurrogatePairIndexForStringIndex(int stringIndex)
+        {
+            if (stringIndex == _stringValue.Length)
+            {
+                return _surrogatePairIndexByStringIndex.Count;
+            }
+            if (!_surrogatePairIndexByStringIndex.TryGetValue(stringIndex, out int surrogatePairIndex))
+            {
+                throw new ArgumentException($"No non-surrogate code point begins at index {stringIndex}");
+            }
+            return surrogatePairIndex;
+        }
+
+        /// <summary>Returns <paramref name="length"/> characters starting at a surrogate pair index.</summary>
+        public string Substring(int startSurrogatePairIndex, int length)
+        {
+            int endSurrogatePairIndex = startSurrogatePairIndex + length;
+            int startStringIndex = GetStringIndexForSurrogatePairIndex(startSurrogatePairIndex);
+            int endStringIndex = GetStringIndexForSurrogatePairIndex(endSurrogatePairIndex);
+            return _stringValue.Substring(startStringIndex, endStringIndex - startStringIndex);
+        }
+
+        /// <summary>Maps a surrogate pair index to a UTF-16 index (Length maps to end of string).</summary>
+        public int GetStringIndexForSurrogatePairIndex(int surrogatePairIndex)
+        {
+            if (surrogatePairIndex == Length)
+            {
+                return _stringValue.Length;
+            }
+            return _stringIndexBySurrogatePairIndex[surrogatePairIndex];
+        }
+    }
 }
diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs
index 44608d7e..0af3f39c 100644
--- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs
+++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs
@@ -292,8 +292,8 @@ public void ThatAllPossibleQuotationMarksAreIdentified()
                 [
                     new QuotationMarkStringMatch(
                         new TextSegment.Builder().SetText("उत्पत्ति \"पुस्तकले").Build(),
-                        6,
-                        7
+                        9,
+                        10
                     ),
                 ]
             )
diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs
index 94630c80..d099426d 100644
--- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs
+++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs
@@ -205,7 +205,7 @@ public void GetPreviousCharacter()
             0,
             1
         );
-        Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("ले"));
+        Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("\u0947"));
     }
 
     [Test]
diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs
index f62b750a..5eb7a8ec 100644
--- a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs
+++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs
@@ -188,8 +188,14 @@ public void Length()
 
         textSegment = new TextSegment.Builder().SetText("new example text").Build();
         Assert.That(textSegment.Length, Is.EqualTo("new example text".Length));
+
+        //Combining characters
         textSegment = new TextSegment.Builder().SetText("उत्पत्ति पुस्तकले").Build();
-        Assert.That(textSegment.Length, Is.EqualTo(11));
+        Assert.That(textSegment.Length, Is.EqualTo(17));
+
+        //Surrogate pairs
+        textSegment = new TextSegment.Builder().SetText("𝜺𝜺").Build();
+        Assert.That(textSegment.Length, Is.EqualTo(2));
     }
 
     [Test]