Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 3 additions & 12 deletions src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using PCRE;

Expand Down Expand Up @@ -46,17 +45,9 @@ public List<QuotationMarkStringMatch> FindAllPotentialQuotationMarksInTextSegmen
)
.Select(m =>
{
int[] textElementIndices = StringInfo.ParseCombiningCharacters(textSegment.Text);
int startIndex = 0;
int endIndex = textElementIndices.Length;
for (int textElementIndex = 0; textElementIndex < textElementIndices.Length; textElementIndex++)
{
int stringIndex = textElementIndices[textElementIndex];
if (stringIndex == m.Groups[0].Index)
startIndex = textElementIndex;
if (stringIndex == m.Groups[0].EndIndex)
endIndex = textElementIndex;
}
SurrogatePairString surrogatePairString = new SurrogatePairString(textSegment.Text);
int startIndex = surrogatePairString.GetSurrogatePairIndexForStringIndex(m.Groups[0].Index);
int endIndex = surrogatePairString.GetSurrogatePairIndexForStringIndex(m.Groups[0].EndIndex);
return new QuotationMarkStringMatch(textSegment, startIndex, endIndex);
})
.ToList();
Expand Down
15 changes: 5 additions & 10 deletions src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System;
using System.Globalization;
using System.Text.RegularExpressions;
using PCRE;

Expand Down Expand Up @@ -42,8 +41,7 @@ public override int GetHashCode()
return code;
}

public string QuotationMark =>
new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex, EndIndex - StartIndex);
public string QuotationMark => TextSegment.Substring(StartIndex, EndIndex - StartIndex);

public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventions) =>
quoteConventions.IsValidOpeningQuotationMark(QuotationMark);
Expand Down Expand Up @@ -74,14 +72,11 @@ public string PreviousCharacter
TextSegment previousSegment = TextSegment.PreviousSegment;
if (previousSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph))
{
return new StringInfo(previousSegment.Text).SubstringByTextElements(
StringInfo.ParseCombiningCharacters(previousSegment.Text).Length - 1,
1
);
return previousSegment.Substring(previousSegment.Length - 1, 1);
}
return null;
}
return new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex - 1, 1);
return TextSegment.Substring(StartIndex - 1, 1);
}
}

Expand All @@ -94,11 +89,11 @@ public string NextCharacter
TextSegment nextSegment = TextSegment.NextSegment;
if (nextSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph))
{
return new StringInfo(nextSegment.Text).SubstringByTextElements(0, 1);
return nextSegment.Substring(0, 1);
}
return null;
}
return new StringInfo(TextSegment.Text).SubstringByTextElements(EndIndex, 1);
return TextSegment.Substring(EndIndex, 1);
}
}

Expand Down
107 changes: 102 additions & 5 deletions src/SIL.Machine/PunctuationAnalysis/TextSegment.cs
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using SIL.Machine.Corpora;

namespace SIL.Machine.PunctuationAnalysis
{
public class TextSegment : IEquatable<TextSegment>
{
public string Text { get; private set; }
public string Text
{
get => _surrogatePairString.ToString();
private set => _surrogatePairString = new SurrogatePairString(value);
}
public UsfmMarkerType ImmediatePrecedingMarker { get; private set; }
public HashSet<UsfmMarkerType> MarkersInPrecedingContext { get; private set; }
public TextSegment PreviousSegment { get; set; }
public TextSegment NextSegment { get; set; }
public int IndexInVerse { get; set; }
public int NumSegmentsInVerse { get; set; }
public UsfmToken UsfmToken { get; private set; }
private SurrogatePairString _surrogatePairString;

public TextSegment()
{
Expand Down Expand Up @@ -71,16 +76,21 @@ public override int GetHashCode()
return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode();
}

public int Length => StringInfo.ParseCombiningCharacters(Text).Length;
public int Length => _surrogatePairString.Length;

/// <summary>
/// Returns part of this segment's text by delegating to the underlying
/// <see cref="SurrogatePairString"/>, so <paramref name="startIndex"/> and
/// <paramref name="length"/> are measured in that class's index units rather than
/// in raw UTF-16 code units.
/// </summary>
/// <param name="startIndex">Index of the first unit to include.</param>
/// <param name="length">Number of units to include.</param>
/// <returns>The requested portion of the text.</returns>
public string Substring(int startIndex, int length) =>
    _surrogatePairString.Substring(startIndex, length);

public string SubstringBefore(int index)
{
return Text.Substring(0, index);
return Substring(0, index);
}

public string SubstringAfter(int index)
{
return Text.Substring(index);
return Substring(index, Length - index);
}

public bool MarkerIsInPrecedingContext(UsfmMarkerType marker)
Expand Down Expand Up @@ -147,4 +157,91 @@ public TextSegment Build()
}
}
}

/// <summary>
/// Class to handle indexing of strings by unicode code point, treating surrogate pairs as single characters.
/// </summary>
/// <remarks>
/// Two index spaces are involved:
/// <list type="bullet">
/// <item><description>"string index": an offset into the UTF-16 <see cref="string"/>.</description></item>
/// <item><description>"surrogate pair index": an offset counted over every UTF-16 code unit that is not a
/// low surrogate, so a high/low surrogate pair occupies a single position.</description></item>
/// </list>
/// Combining characters are NOT merged; each one occupies its own position.
/// </remarks>
public class SurrogatePairString
{
    /// <summary>The wrapped UTF-16 string, unchanged.</summary>
    public string String => _stringValue;

    /// <summary>Number of positions in surrogate-pair index space.</summary>
    public int Length => _stringIndexBySurrogatePairIndex.Count;

    private readonly string _stringValue;

    // Key: string index at which a position starts. Value: its surrogate pair index.
    private readonly Dictionary<int, int> _surrogatePairIndexByStringIndex;

    // Key: surrogate pair index. Value: string index at which that position starts.
    private readonly Dictionary<int, int> _stringIndexBySurrogatePairIndex;

    /// <summary>
    /// Builds the two-way mapping between string indices and surrogate pair indices.
    /// </summary>
    /// <param name="stringValue">The string to wrap.</param>
    /// <exception cref="ArgumentNullException"><paramref name="stringValue"/> is null.</exception>
    public SurrogatePairString(string stringValue)
    {
        if (stringValue == null)
        {
            throw new ArgumentNullException(nameof(stringValue));
        }

        _stringValue = stringValue;
        _surrogatePairIndexByStringIndex = new Dictionary<int, int>();
        _stringIndexBySurrogatePairIndex = new Dictionary<int, int>();

        for (int stringIndex = 0; stringIndex < _stringValue.Length; stringIndex++)
        {
            // A low surrogate never starts a position; it belongs to the preceding high surrogate.
            if (char.IsLowSurrogate(_stringValue[stringIndex]))
            {
                continue;
            }

            // The next free surrogate pair index is the number of positions recorded so far.
            int surrogatePairIndex = _stringIndexBySurrogatePairIndex.Count;
            _surrogatePairIndexByStringIndex[stringIndex] = surrogatePairIndex;
            _stringIndexBySurrogatePairIndex[surrogatePairIndex] = stringIndex;
        }
    }

    /// <summary>Returns the wrapped string.</summary>
    public override string ToString()
    {
        return _stringValue;
    }

    /// <summary>
    /// Gets the text at the given surrogate pair index: two UTF-16 code units when the position
    /// holds a valid surrogate pair, otherwise a single code unit.
    /// </summary>
    /// <param name="surrogatePairIndex">Zero-based position in surrogate-pair index space.</param>
    /// <exception cref="IndexOutOfRangeException">
    /// The index is negative or not less than <see cref="Length"/>.
    /// </exception>
    public string this[int surrogatePairIndex]
    {
        get
        {
            if (surrogatePairIndex < 0 || surrogatePairIndex >= Length)
            {
                throw new IndexOutOfRangeException(
                    $"Index {surrogatePairIndex} is out of bounds for SurrogatePairString with length {Length}."
                );
            }
            int stringIndex = _stringIndexBySurrogatePairIndex[surrogatePairIndex];
            char characterAtStringIndex = _stringValue[stringIndex];
            // Only look at the following code unit when there is one; a trailing lone high
            // surrogate is returned on its own.
            if (
                stringIndex + 1 < _stringValue.Length
                && char.IsSurrogatePair(characterAtStringIndex, _stringValue[stringIndex + 1])
            )
            {
                return _stringValue.Substring(stringIndex, 2);
            }
            return characterAtStringIndex.ToString();
        }
    }

    /// <summary>
    /// Converts a string index into the corresponding surrogate pair index.
    /// </summary>
    /// <param name="stringIndex">
    /// A string index at which a position starts, or the string's length (the end position).
    /// </param>
    /// <returns>The surrogate pair index; <see cref="Length"/> for the end position.</returns>
    /// <exception cref="ArgumentException">
    /// No position starts at <paramref name="stringIndex"/> (it is out of range or points at a low surrogate).
    /// </exception>
    public int GetSurrogatePairIndexForStringIndex(int stringIndex)
    {
        // The one-past-the-end offset is valid so callers can convert exclusive end indices.
        if (stringIndex == _stringValue.Length)
        {
            return Length;
        }
        if (!_surrogatePairIndexByStringIndex.TryGetValue(stringIndex, out int surrogatePairIndex))
        {
            throw new ArgumentException($"No non-surrogate code point begins at index {stringIndex}");
        }
        return surrogatePairIndex;
    }

    /// <summary>
    /// Extracts a substring addressed in surrogate-pair index space.
    /// </summary>
    /// <param name="startSurrogatePairIndex">Surrogate pair index of the first position to include.</param>
    /// <param name="length">Number of positions to include.</param>
    /// <returns>The corresponding slice of the wrapped string.</returns>
    public string Substring(int startSurrogatePairIndex, int length)
    {
        int endSurrogatePairIndex = startSurrogatePairIndex + length;
        int startStringIndex = GetStringIndexForSurrogatePairIndex(startSurrogatePairIndex);
        int endStringIndex = GetStringIndexForSurrogatePairIndex(endSurrogatePairIndex);
        return _stringValue.Substring(startStringIndex, endStringIndex - startStringIndex);
    }

    /// <summary>
    /// Converts a surrogate pair index into the string index at which that position starts.
    /// </summary>
    /// <param name="surrogatePairIndex">
    /// A surrogate pair index in <c>[0, Length]</c>; <see cref="Length"/> denotes the end position.
    /// </param>
    /// <returns>The string index; the string's length for the end position.</returns>
    /// <exception cref="KeyNotFoundException">The index is outside <c>[0, Length]</c>.</exception>
    public int GetStringIndexForSurrogatePairIndex(int surrogatePairIndex)
    {
        if (surrogatePairIndex == Length)
        {
            return _stringValue.Length;
        }
        return _stringIndexBySurrogatePairIndex[surrogatePairIndex];
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,8 @@ public void ThatAllPossibleQuotationMarksAreIdentified()
[
new QuotationMarkStringMatch(
new TextSegment.Builder().SetText("उत्पत्ति \"पुस्तकले").Build(),
6,
7
9,
10
),
]
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ public void GetPreviousCharacter()
0,
1
);
Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("ले"));
Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("\u0947"));
}

[Test]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,14 @@ public void Length()

textSegment = new TextSegment.Builder().SetText("new example text").Build();
Assert.That(textSegment.Length, Is.EqualTo("new example text".Length));

//Combining characters
textSegment = new TextSegment.Builder().SetText("उत्पत्ति पुस्तकले").Build();
Assert.That(textSegment.Length, Is.EqualTo(11));
Assert.That(textSegment.Length, Is.EqualTo(17));

//Surrogate pairs
textSegment = new TextSegment.Builder().SetText("𝜺𝜺").Build();
Assert.That(textSegment.Length, Is.EqualTo(2));
}

[Test]
Expand Down
Loading