From 0a815710c4fea8a3c176495e2aa9a2fac0078bd1 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 11 Sep 2025 17:43:16 -0400 Subject: [PATCH 1/5] Add test to specifically cover surrogate pairs, not just combining characters --- .../PunctuationAnalysis/TextSegmentTests.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs index f62b750a..42504511 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs @@ -188,8 +188,14 @@ public void Length() textSegment = new TextSegment.Builder().SetText("new example text").Build(); Assert.That(textSegment.Length, Is.EqualTo("new example text".Length)); + + //Combining characters textSegment = new TextSegment.Builder().SetText("उत्पत्ति पुस्तकले").Build(); Assert.That(textSegment.Length, Is.EqualTo(11)); + + //Surrogate pairs + textSegment = new TextSegment.Builder().SetText("𝜺𝜺").Build(); + Assert.That(textSegment.Length, Is.EqualTo(2)); } [Test] From bc5f65680873879b3a0f2ea6831c194e3013b812 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 16 Sep 2025 10:44:48 -0400 Subject: [PATCH 2/5] Custom string processing to combine only surrogate pairs (not combining characters) --- .../QuotationMarkFinder.cs | 15 +-- .../QuotationMarkStringMatch.cs | 15 +-- .../PunctuationAnalysis/TextSegment.cs | 100 +++++++++++++++++- .../QuotationMarkFinderTests.cs | 4 +- .../QuotationMarkStringMatchTests.cs | 2 +- .../PunctuationAnalysis/TextSegmentTests.cs | 2 +- 6 files changed, 107 insertions(+), 31 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs index 2c56e871..3a3fa82e 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs @@ -1,5 +1,4 @@ using System.Collections.Generic; -using System.Globalization; using System.Linq; using PCRE; @@ -46,17 +45,9 @@ public List FindAllPotentialQuotationMarksInTextSegmen ) .Select(m => { - int[] textElementIndices = StringInfo.ParseCombiningCharacters(textSegment.Text); - int startIndex = 0; - int endIndex = textElementIndices.Length; - for (int textElementIndex = 0; textElementIndex < textElementIndices.Length; textElementIndex++) - { - int stringIndex = textElementIndices[textElementIndex]; - if (stringIndex == m.Groups[0].Index) - startIndex = textElementIndex; - if (stringIndex == m.Groups[0].EndIndex) - endIndex = textElementIndex; - } + CodePointString codePointString = new CodePointString(textSegment.Text); + int startIndex = codePointString.GetCodePointIndexForStringIndex(m.Groups[0].Index); + int endIndex = codePointString.GetCodePointIndexForStringIndex(m.Groups[0].EndIndex); return new QuotationMarkStringMatch(textSegment, startIndex, endIndex); }) .ToList(); diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs index 0e322ca6..6cee1924 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs @@ -1,5 +1,4 @@ using System; -using System.Globalization; using System.Text.RegularExpressions; using PCRE; @@ -42,8 +41,7 @@ public override int GetHashCode() return code; } - public string QuotationMark => - new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex, EndIndex - StartIndex); + public string QuotationMark => TextSegment.Substring(StartIndex, EndIndex - StartIndex); public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventions) => quoteConventions.IsValidOpeningQuotationMark(QuotationMark); @@ -74,14 +72,11 @@ public string PreviousCharacter TextSegment previousSegment = TextSegment.PreviousSegment; if (previousSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) { - return new StringInfo(previousSegment.Text).SubstringByTextElements( - StringInfo.ParseCombiningCharacters(previousSegment.Text).Length - 1, - 1 - ); + return previousSegment.Substring(previousSegment.Length - 1, 1); } return null; } - return new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex - 1, 1); + return TextSegment.Substring(StartIndex - 1, 1); } } @@ -94,11 +89,11 @@ public string NextCharacter TextSegment nextSegment = TextSegment.NextSegment; if (nextSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph)) { - return new StringInfo(nextSegment.Text).SubstringByTextElements(0, 1); + return nextSegment.Substring(0, 1); } return null; } - return new StringInfo(TextSegment.Text).SubstringByTextElements(EndIndex, 1); + return TextSegment.Substring(EndIndex, 1); } } diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index 6b20438c..f02ee2b6 100644 --- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -1,13 +1,21 @@ using System; using System.Collections.Generic; -using System.Globalization; +using System.Linq; using SIL.Machine.Corpora; namespace SIL.Machine.PunctuationAnalysis { public class TextSegment : IEquatable { - public string Text { get; private set; } + public string Text + { + get => _text; + private set + { + _codePointString = new CodePointString(value); + _text = value; + } + } public UsfmMarkerType ImmediatePrecedingMarker { get; private set; } public HashSet MarkersInPrecedingContext { get; private set; } public TextSegment PreviousSegment { get; set; } @@ -16,6 +24,9 @@ public class TextSegment : IEquatable public int NumSegmentsInVerse { get; set; } public UsfmToken UsfmToken { get; private set; } + private string _text; + private CodePointString _codePointString; + public TextSegment() { Text = ""; @@ -71,16 +82,21 @@ public override int GetHashCode() return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode(); } - public int Length => StringInfo.ParseCombiningCharacters(Text).Length; + public int Length => _codePointString.Length; + + public string Substring(int startIndex, int length) + { + return _codePointString.Substring(startIndex, length); + } public string SubstringBefore(int index) { - return Text.Substring(0, index); + return Substring(0, index); } public string SubstringAfter(int index) { - return Text.Substring(index); + return Substring(index, Length - index); } public bool MarkerIsInPrecedingContext(UsfmMarkerType marker) @@ -147,4 +163,78 @@ public TextSegment Build() } } } + + public class CodePointString + { + public string String => _stringValue; + public int Length => _stringIndexByCodePointIndex.Count; + + private readonly string _stringValue; + private readonly Dictionary _codePointIndexByStringIndex; + private readonly Dictionary _stringIndexByCodePointIndex; + + public CodePointString(string stringValue) + { + _stringValue = stringValue; + IEnumerable<(int CodePointIndex, int StringIndex)> indexPairs = _stringValue + .Select((c, i) => (c, i)) + .Where(tup => !char.IsLowSurrogate(tup.c)) + .Select((tup, i) => (tup.i, i)); + _codePointIndexByStringIndex = indexPairs.ToDictionary(tup => tup.StringIndex, tup => tup.CodePointIndex); + _stringIndexByCodePointIndex = indexPairs.ToDictionary(tup => tup.CodePointIndex, tup => tup.StringIndex); + } + + public string this[int codePointIndex] + { + get + { + if (codePointIndex < 0 || codePointIndex > Length) + { + throw new IndexOutOfRangeException( + $"Index {codePointIndex} is out of bounds for CodePointString with length {Length}." + ); + } + int stringIndex = _stringIndexByCodePointIndex[codePointIndex]; + char characterAtStringIndex = _stringValue[stringIndex]; + if ( + stringIndex < _stringValue.Length + && char.IsSurrogatePair(characterAtStringIndex, _stringValue[stringIndex + 1]) + ) + { + return _stringValue.Substring(stringIndex, 2); + } + return characterAtStringIndex.ToString(); + } + } + + public int GetCodePointIndexForStringIndex(int stringIndex) + { + if (stringIndex == _stringValue.Length) + { + return _codePointIndexByStringIndex.Count; + } + if (!_codePointIndexByStringIndex.TryGetValue(stringIndex, out int codePointIndex)) + { + throw new ArgumentException($"No non-surrogate code point begins at index {stringIndex}"); + } + return codePointIndex; + } + + public string Substring(int startCodePointIndex, int length) + { + int endCodePointIndex = startCodePointIndex + length; + int startStringIndex = GetStringIndexForCodePointIndex(startCodePointIndex); + int endStringIndex = GetStringIndexForCodePointIndex(endCodePointIndex); + return _stringValue.Substring(startStringIndex, endStringIndex - startStringIndex); + } + + public int GetStringIndexForCodePointIndex(int codePointIndex) + { + if (codePointIndex == _codePointIndexByStringIndex.Count) + { + return _stringValue.Length; + } + return _codePointIndexByStringIndex[codePointIndex]; + } + } } diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs index 44608d7e..0af3f39c 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -292,8 +292,8 @@ public void ThatAllPossibleQuotationMarksAreIdentified() [ new QuotationMarkStringMatch( new TextSegment.Builder().SetText("उत्पत्ति \"पुस्तकले").Build(), - 6, - 7 + 9, + 10 ), ] ) diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs index 94630c80..d099426d 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs @@ -205,7 +205,7 @@ public void GetPreviousCharacter() 0, 1 ); - Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("ले")); + Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("\u0947")); } [Test] diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs index 42504511..5eb7a8ec 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs @@ -191,7 +191,7 @@ public void Length() //Combining characters textSegment = new TextSegment.Builder().SetText("उत्पत्ति पुस्तकले").Build(); - Assert.That(textSegment.Length, Is.EqualTo(11)); + Assert.That(textSegment.Length, Is.EqualTo(17)); //Surrogate pairs textSegment = new TextSegment.Builder().SetText("𝜺𝜺").Build(); From 46a3e2a77a9f38ad19b8ae43ef6488c6d0533712 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 16 Sep 2025 16:47:09 -0400 Subject: [PATCH 3/5] Address reviewer comments --- .../PunctuationAnalysis/TextSegment.cs | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index f02ee2b6..9ce78ba0 100644 --- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -9,12 +9,8 @@ public class TextSegment : IEquatable { public string Text { - get => _text; - private set - { - _codePointString = new CodePointString(value); - _text = value; - } + get => _codePointString.ToString(); + private set { _codePointString = new CodePointString(value); } } public UsfmMarkerType ImmediatePrecedingMarker { get; private set; } public HashSet MarkersInPrecedingContext { get; private set; } @@ -23,8 +19,6 @@ private set public int IndexInVerse { get; set; } public int NumSegmentsInVerse { get; set; } public UsfmToken UsfmToken { get; private set; } - - private string _text; private CodePointString _codePointString; public TextSegment() @@ -164,6 +158,9 @@ public TextSegment Build() } } + /// + /// Class to handle indexing of strings by unicode code point, treating surrogate pairs as single characters. + /// public class CodePointString { public string String => _stringValue; @@ -180,8 +177,18 @@ public CodePointString(string stringValue) .Select((c, i) => (c, i)) .Where(tup => !char.IsLowSurrogate(tup.c)) .Select((tup, i) => (tup.i, i)); - _codePointIndexByStringIndex = indexPairs.ToDictionary(tup => tup.StringIndex, tup => tup.CodePointIndex); - _stringIndexByCodePointIndex = indexPairs.ToDictionary(tup => tup.CodePointIndex, tup => tup.StringIndex); + _codePointIndexByStringIndex = new Dictionary(); + _stringIndexByCodePointIndex = new Dictionary(); + foreach ((int codePointIndex, int stringIndex) in indexPairs) + { + _codePointIndexByStringIndex[stringIndex] = codePointIndex; + _stringIndexByCodePointIndex[codePointIndex] = stringIndex; + } + } + + public override string ToString() + { + return _stringValue; } public string this[int codePointIndex] From b279bc7bc52edd0dd380998f28a49419ccfa4a0a Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 16 Sep 2025 16:49:38 -0400 Subject: [PATCH 4/5] Use => syntax for setter --- src/SIL.Machine/PunctuationAnalysis/TextSegment.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index 9ce78ba0..d61b51d3 100644 --- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -10,7 +10,7 @@ public class TextSegment : IEquatable public string Text { get => _codePointString.ToString(); - private set { _codePointString = new CodePointString(value); } + private set => _codePointString = new CodePointString(value); } public UsfmMarkerType ImmediatePrecedingMarker { get; private set; } public HashSet MarkersInPrecedingContext { get; private set; } From d5195085966daa622bf7c9d7dc34caff4b2655e8 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 17 Sep 2025 18:44:29 -0400 Subject: [PATCH 5/5] Change code point to surrogate pair --- .../QuotationMarkFinder.cs | 6 +- .../PunctuationAnalysis/TextSegment.cs | 62 +++++++++---------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs index 3a3fa82e..d045b266 100644 --- a/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs +++ b/src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs @@ -45,9 +45,9 @@ public List FindAllPotentialQuotationMarksInTextSegmen ) .Select(m => { - CodePointString codePointString = new CodePointString(textSegment.Text); - int startIndex = codePointString.GetCodePointIndexForStringIndex(m.Groups[0].Index); - int endIndex = codePointString.GetCodePointIndexForStringIndex(m.Groups[0].EndIndex); + SurrogatePairString surrogatePairString = new SurrogatePairString(textSegment.Text); + int startIndex = surrogatePairString.GetSurrogatePairIndexForStringIndex(m.Groups[0].Index); + int endIndex = surrogatePairString.GetSurrogatePairIndexForStringIndex(m.Groups[0].EndIndex); return new QuotationMarkStringMatch(textSegment, startIndex, endIndex); }) .ToList(); diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index d61b51d3..13ef13ec 100644 --- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -9,8 +9,8 @@ public class TextSegment : IEquatable { public string Text { - get => _codePointString.ToString(); - private set => _codePointString = new CodePointString(value); + get => _surrogatePairString.ToString(); + private set => _surrogatePairString = new SurrogatePairString(value); } public UsfmMarkerType ImmediatePrecedingMarker { get; private set; } public HashSet MarkersInPrecedingContext { get; private set; } @@ -19,7 +19,7 @@ public string Text public int IndexInVerse { get; set; } public int NumSegmentsInVerse { get; set; } public UsfmToken UsfmToken { get; private set; } - private CodePointString _codePointString; + private SurrogatePairString _surrogatePairString; public TextSegment() { @@ -76,11 +76,11 @@ public override int GetHashCode() return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode(); } - public int Length => _codePointString.Length; + public int Length => _surrogatePairString.Length; public string Substring(int startIndex, int length) { - return _codePointString.Substring(startIndex, length); + return _surrogatePairString.Substring(startIndex, length); } public string SubstringBefore(int index) @@ -161,28 +161,28 @@ public TextSegment Build() /// /// Class to handle indexing of strings by unicode code point, treating surrogate pairs as single characters. /// - public class CodePointString + public class SurrogatePairString { public string String => _stringValue; - public int Length => _stringIndexByCodePointIndex.Count; + public int Length => _stringIndexBySurrogatePairIndex.Count; private readonly string _stringValue; - private readonly Dictionary _codePointIndexByStringIndex; - private readonly Dictionary _stringIndexByCodePointIndex; + private readonly Dictionary _surrogatePairIndexByStringIndex; + private readonly Dictionary _stringIndexBySurrogatePairIndex; - public CodePointString(string stringValue) + public SurrogatePairString(string stringValue) { _stringValue = stringValue; - IEnumerable<(int CodePointIndex, int StringIndex)> indexPairs = _stringValue + IEnumerable<(int SurrogatePairIndex, int StringIndex)> indexPairs = _stringValue .Select((c, i) => (c, i)) .Where(tup => !char.IsLowSurrogate(tup.c)) .Select((tup, i) => (tup.i, i)); - _codePointIndexByStringIndex = new Dictionary(); - _stringIndexByCodePointIndex = new Dictionary(); - foreach ((int codePointIndex, int stringIndex) in indexPairs) + _surrogatePairIndexByStringIndex = new Dictionary(); + _stringIndexBySurrogatePairIndex = new Dictionary(); + foreach ((int surrogatePairIndex, int stringIndex) in indexPairs) { - _codePointIndexByStringIndex[stringIndex] = codePointIndex; - _stringIndexByCodePointIndex[codePointIndex] = stringIndex; + _surrogatePairIndexByStringIndex[stringIndex] = surrogatePairIndex; + _stringIndexBySurrogatePairIndex[surrogatePairIndex] = stringIndex; } } @@ -191,17 +191,17 @@ public override string ToString() return _stringValue; } - public string this[int codePointIndex] + public string this[int surrogatePairIndex] { get { - if (codePointIndex < 0 || codePointIndex > Length) + if (surrogatePairIndex < 0 || surrogatePairIndex > Length) { throw new IndexOutOfRangeException( - $"Index {codePointIndex} is out of bounds for CodePointString with length {Length}." + $"Index {surrogatePairIndex} is out of bounds for SurrogatePairString with length {Length}." ); } - int stringIndex = _stringIndexByCodePointIndex[codePointIndex]; + int stringIndex = _stringIndexBySurrogatePairIndex[surrogatePairIndex]; char characterAtStringIndex = _stringValue[stringIndex]; if ( stringIndex < _stringValue.Length @@ -214,34 +214,34 @@ public string this[int codePointIndex] } } - public int GetCodePointIndexForStringIndex(int stringIndex) + public int GetSurrogatePairIndexForStringIndex(int stringIndex) { if (stringIndex == _stringValue.Length) { - return _codePointIndexByStringIndex.Count; + return _surrogatePairIndexByStringIndex.Count; } - if (!_codePointIndexByStringIndex.TryGetValue(stringIndex, out int codePointIndex)) + if (!_surrogatePairIndexByStringIndex.TryGetValue(stringIndex, out int surrogatePairIndex)) { throw new ArgumentException($"No non-surrogate code point begins at index {stringIndex}"); } - return codePointIndex; + return surrogatePairIndex; } - public string Substring(int startCodePointIndex, int length) + public string Substring(int startSurrogatePairIndex, int length) { - int endCodePointIndex = startCodePointIndex + length; - int startStringIndex = GetStringIndexForCodePointIndex(startCodePointIndex); - int endStringIndex = GetStringIndexForCodePointIndex(endCodePointIndex); + int endSurrogatePairIndex = startSurrogatePairIndex + length; + int startStringIndex = GetStringIndexForSurrogatePairIndex(startSurrogatePairIndex); + int endStringIndex = GetStringIndexForSurrogatePairIndex(endSurrogatePairIndex); return _stringValue.Substring(startStringIndex, endStringIndex - startStringIndex); } - public int GetStringIndexForCodePointIndex(int codePointIndex) + public int GetStringIndexForSurrogatePairIndex(int surrogatePairIndex) { - if (codePointIndex == _codePointIndexByStringIndex.Count) + if (surrogatePairIndex == _surrogatePairIndexByStringIndex.Count) { return _stringValue.Length; } - return _codePointIndexByStringIndex[codePointIndex]; + return _surrogatePairIndexByStringIndex[surrogatePairIndex]; } } }