Skip to content
8 changes: 6 additions & 2 deletions src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ public string UpdateUsfm(
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
IEnumerable<string> preserveParagraphStyles = null,
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
IEnumerable<string> remarks = null
IEnumerable<string> remarks = null,
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
bool compareSegments = false
)
{
string fileName = _settings.GetBookFileName(bookId);
Expand All @@ -51,7 +53,9 @@ public string UpdateUsfm(
styleBehavior,
preserveParagraphStyles,
updateBlockHandlers,
remarks
remarks,
errorHandler,
compareSegments
);
try
{
Expand Down
37 changes: 37 additions & 0 deletions src/SIL.Machine/Corpora/ScriptureRefComparer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
using System.Collections.Generic;
using SIL.Extensions;
using SIL.Machine.Corpora;

/// <summary>
/// Ordering and equality comparer for <see cref="ScriptureRef"/> values that can either take the
/// segment portion of a reference into account or ignore it.
/// </summary>
/// <remarks>
/// All comparisons are delegated to <c>ScriptureRef.CompareTo(other, compareSegments)</c>.
/// NOTE(review): neither <see cref="Compare"/> nor <see cref="Equals(ScriptureRef, ScriptureRef)"/> guards
/// against a null first argument - confirm that callers never pass null.
/// </remarks>
public class ScriptureRefComparer : IComparer<ScriptureRef>, IEqualityComparer<ScriptureRef>
{
    private readonly bool _compareSegments;

    /// <summary>Creates a comparer.</summary>
    /// <param name="compareSegments">
    /// Passed through to <c>ScriptureRef.CompareTo</c>; also selects which verse value is hashed.
    /// </param>
    public ScriptureRefComparer(bool compareSegments = true)
    {
        _compareSegments = compareSegments;
    }

    /// <summary>Shared comparer that takes segments into account.</summary>
    public static ScriptureRefComparer Default { get; } = new ScriptureRefComparer(compareSegments: true);

    /// <summary>Shared comparer that ignores segments.</summary>
    public static ScriptureRefComparer IgnoreSegments { get; } = new ScriptureRefComparer(compareSegments: false);

    /// <summary>Returns the result of <c>x.CompareTo(y, compareSegments)</c>.</summary>
    public int Compare(ScriptureRef x, ScriptureRef y) => x.CompareTo(y, _compareSegments);

    /// <summary>Two references are equal when <c>CompareTo</c> reports zero for them.</summary>
    public bool Equals(ScriptureRef x, ScriptureRef y) => x.CompareTo(y, _compareSegments) == 0;

    /// <summary>
    /// Combines the verse value, the versification and the relaxed path into a hash code.
    /// </summary>
    /// <param name="obj">The reference to hash.</param>
    /// <returns>A hash code built with the usual multiply-by-31 accumulation, seeded with 23.</returns>
    public int GetHashCode(ScriptureRef obj)
    {
        // Hash the segment-bearing value only when segments take part in equality.
        int verseHash = _compareSegments
            ? obj.VerseRef.BBBCCCVVVS.GetHashCode()
            : obj.VerseRef.BBBCCCVVV.GetHashCode();

        int result = 23;
        result = result * 31 + verseHash;
        // NOTE(review): the versification is hashed here; confirm CompareTo never treats refs with
        // different versifications as equal, otherwise equal refs could hash differently.
        result = result * 31 + obj.Versification.GetHashCode();
        // The relaxed form of the path is hashed so that refs which Equals matches through its
        // relaxed-ref handling also produce the same hash code.
        result = result * 31 + obj.ToRelaxed().Path.GetSequenceHashCode();
        return result;
    }
}
15 changes: 8 additions & 7 deletions src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
private VerseRef _curVerseRef;
private readonly Stack<ScriptureElement> _curElements;
private readonly Stack<ScriptureTextType> _curTextType;
private bool _duplicateVerse = false;

protected ScriptureRefUsfmParserHandlerBase()
{
Expand All @@ -29,6 +28,8 @@ protected ScriptureRefUsfmParserHandlerBase()
protected ScriptureTextType CurrentTextType =>
_curTextType.Count == 0 ? ScriptureTextType.None : _curTextType.Peek();

protected bool DuplicateVerse { get; private set; }

private static readonly string[] EmbedStyles = new[] { "f", "fe", "x", "fig" };

private static bool IsEmbedStyle(string marker)
Expand Down Expand Up @@ -66,13 +67,13 @@ public override void Verse(
string pubNumber
)
{
if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse)
if (state.VerseRef.Equals(_curVerseRef) && !DuplicateVerse)
{
if (state.VerseRef.VerseNum > 0)
{
EndVerseText(state, CreateVerseRefs());
// ignore duplicate verses
_duplicateVerse = true;
DuplicateVerse = true;
}
}
else if (VerseRef.AreOverlappingVersesRanges(verse1: number, verse2: _curVerseRef.Verse))
Expand Down Expand Up @@ -251,14 +252,14 @@ protected virtual void EndEmbedText(UsfmParserState state, ScriptureRef scriptur

private void StartVerseText(UsfmParserState state)
{
_duplicateVerse = false;
DuplicateVerse = false;
_curTextType.Push(ScriptureTextType.Verse);
StartVerseText(state, CreateVerseRefs());
}

private void EndVerseText(UsfmParserState state)
{
if (!_duplicateVerse && _curVerseRef.VerseNum > 0)
if (!DuplicateVerse && _curVerseRef.VerseNum > 0)
EndVerseText(state, CreateVerseRefs());
if (_curVerseRef.VerseNum > 0)
_curTextType.Pop();
Expand Down Expand Up @@ -291,7 +292,7 @@ private void StartEmbedText(UsfmParserState state, string marker)
{
if (_curVerseRef.IsDefault)
UpdateVerseRef(state.VerseRef, marker);
if (!_duplicateVerse)
if (!DuplicateVerse)
{
CheckConvertVerseParaToNonVerse(state);
NextElement(marker);
Expand All @@ -302,7 +303,7 @@ private void StartEmbedText(UsfmParserState state, string marker)

private void EndEmbedText(UsfmParserState state)
{
if (!_duplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed)
if (!DuplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed)
{
EndEmbedText(state, CreateNonVerseRef());
_curTextType.Pop();
Expand Down
136 changes: 117 additions & 19 deletions src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
Expand Down Expand Up @@ -42,6 +43,12 @@ public UpdateUsfmRow(
public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
{
private readonly IReadOnlyList<UpdateUsfmRow> _rows;
private int _rowIndex;
private VerseRef _verseRowsRef;
private readonly List<int> _verseRows;
private int _verseRowIndex;
private readonly Dictionary<VerseRef, List<RowInfo>> _verseRowsMap;
private readonly ScrVers _updateRowsVersification;
private readonly List<UsfmToken> _tokens;
private readonly List<UsfmToken> _updatedText;
private readonly List<UsfmToken> _embedTokens;
Expand All @@ -55,10 +62,11 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
private readonly Stack<IUsfmUpdateBlockHandler> _updateBlockHandlers;
private readonly List<string> _remarks;
private readonly Stack<bool> _replace;
private int _rowIndex;
private int _tokenIndex;
private readonly Func<UsfmUpdateBlockHandlerException, bool> _errorHandler;
private readonly bool _compareSegments;

/// <param name="rows">UpdateUsfmRows must be in order</param>
public UpdateUsfmParserHandler(
IReadOnlyList<UpdateUsfmRow> rows = null,
string idText = null,
Expand All @@ -69,10 +77,18 @@ public UpdateUsfmParserHandler(
IEnumerable<string> preserveParagraphStyles = null,
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
IEnumerable<string> remarks = null,
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
bool compareSegments = false
)
{
_rows = rows ?? Array.Empty<UpdateUsfmRow>();
_verseRows = new List<int>();
_verseRowsMap = new Dictionary<VerseRef, List<RowInfo>>(
compareSegments ? VerseRefComparer.Default : VerseRefComparer.IgnoreSegments
);
_updateRowsVersification = ScrVers.English;
if (_rows.Count > 0)
_updateRowsVersification = _rows.First(r => r.Refs.Count > 0).Refs[0].Versification;
_tokens = new List<UsfmToken>();
_updatedText = new List<UsfmToken>();
_updateBlocks = new Stack<UsfmUpdateBlock>();
Expand All @@ -95,6 +111,7 @@ public UpdateUsfmParserHandler(
_errorHandler = errorHandler;
if (_errorHandler == null)
_errorHandler = (error) => false;
_compareSegments = compareSegments;
}

public IReadOnlyList<UsfmToken> Tokens => _tokens;
Expand All @@ -107,6 +124,10 @@ public override void EndUsfm(UsfmParserState state)

public override void StartBook(UsfmParserState state, string marker, string code)
{
_verseRowsRef = state.VerseRef;
UpdateVerseRowsMap();
UpdateVerseRows();

CollectReadonlyTokens(state);
_updateBlocks.Push(new UsfmUpdateBlock());
var startBookTokens = new List<UsfmToken>();
Expand Down Expand Up @@ -137,7 +158,7 @@ IReadOnlyList<UsfmAttribute> attributes
if (state.IsVerseText)
{
// Only strip paragraph markers in a verse
if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve)
if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve && !DuplicateVerse)
{
CollectUpdatableTokens(state);
}
Expand Down Expand Up @@ -193,6 +214,13 @@ string pubNumber
{
UseUpdatedText();

if (!_verseRowsRef.Equals(state.VerseRef))
{
_verseRowsRef = state.VerseRef;
UpdateVerseRowsMap();
UpdateVerseRows();
}

base.Chapter(state, number, marker, altNumber, pubNumber);

CollectReadonlyTokens(state);
Expand Down Expand Up @@ -230,16 +258,31 @@ string pubNumber
}
}

if (!_verseRowsRef.Equals(state.VerseRef))
{
_verseRowsRef = state.VerseRef;
UpdateVerseRows();
}

base.Verse(state, number, marker, altNumber, pubNumber);

CollectReadonlyTokens(state);
if (DuplicateVerse)
{
SkipUpdatableTokens(state);
}
else
{
CollectReadonlyTokens(state);
}
}

public override void StartNote(UsfmParserState state, string marker, string caller, string category)
{
base.StartNote(state, marker, caller, category);

CollectUpdatableTokens(state);
if (!DuplicateVerse)
CollectUpdatableTokens(state);
else
SkipUpdatableTokens(state);
}

public override void EndNote(UsfmParserState state, string marker, bool closed)
Expand Down Expand Up @@ -319,7 +362,7 @@ public override void Text(UsfmParserState state, string text)
base.Text(state, text);

// strip out text in verses that are being replaced
if (ReplaceWithNewTokens(state))
if (ReplaceWithNewTokens(state) || (DuplicateVerse && CurrentTextType == ScriptureTextType.Verse))
SkipUpdatableTokens(state);
else
CollectUpdatableTokens(state);
Expand Down Expand Up @@ -390,15 +433,11 @@ public string GetUsfm(UsfmStylesheet stylesheet)
remarkTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null));
remarkTokens.Add(new UsfmToken(remark));
}

if (tokens.Count > 0 && tokens[0].Marker == "id")
if (tokens.Count > 0)
{
int index = 1;
if (tokens.Count > 1 && tokens[1].Type == UsfmTokenType.Text)
{
index = 2;
}
while (tokens[index].Marker == "rem")
int index = 0;
HashSet<string> markersToSkip = new HashSet<string>() { "id", "ide", "rem" };
while (markersToSkip.Contains(tokens[index].Marker))
{
index++;
if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text)
Expand All @@ -407,6 +446,7 @@ public string GetUsfm(UsfmStylesheet stylesheet)
tokens.InsertRange(index, remarkTokens);
}
}

return tokenizer.Detokenize(tokens);
}

Expand All @@ -418,11 +458,11 @@ IReadOnlyList<ScriptureRef> segScrRefs
Dictionary<string, object> rowMetadata = null;
int sourceIndex = 0;
// search the sorted rows with updated text, starting from where we left off last.
while (_rowIndex < _rows.Count && sourceIndex < segScrRefs.Count)
while (_verseRowIndex < _verseRows.Count && sourceIndex < segScrRefs.Count)
{
// get the set of references for the current row
int compare = 0;
UpdateUsfmRow row = _rows[_rowIndex];
UpdateUsfmRow row = _rows[_verseRows[_verseRowIndex]];
(IReadOnlyList<ScriptureRef> rowScrRefs, string text, IReadOnlyDictionary<string, object> metadata) = (
row.Refs,
row.Text,
Expand All @@ -432,7 +472,7 @@ IReadOnlyList<ScriptureRef> segScrRefs
{
while (sourceIndex < segScrRefs.Count)
{
compare = rowScrRef.CompareTo(segScrRefs[sourceIndex], compareSegments: false);
compare = rowScrRef.CompareTo(segScrRefs[sourceIndex], compareSegments: _compareSegments);
if (compare > 0)
// row is ahead of source, increment source
sourceIndex++;
Expand All @@ -451,7 +491,7 @@ IReadOnlyList<ScriptureRef> segScrRefs
if (compare <= 0)
{
// source is ahead of row, increment row
_rowIndex++;
_verseRowIndex++;
}
}
return (rowTexts, rowMetadata);
Expand Down Expand Up @@ -649,5 +689,63 @@ private bool IsNonverseParagraph(UsfmParserState state, UsfmUpdateBlockElement e
UsfmTag paraTag = state.Stylesheet.GetTag(paraToken.Marker);
return paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != UsfmTextType.NotSpecified;
}

/// <summary>
/// Rebuilds <c>_verseRowsMap</c> from the rows that belong to the chapter of <c>_verseRowsRef</c>.
/// </summary>
/// <remarks>
/// Starting at <c>_rowIndex</c>, rows are consumed while the chapter number of their first reference
/// equals the chapter of <c>_verseRowsRef</c>. Every reference of a consumed row is mapped to one
/// shared <see cref="RowInfo"/>, so a row that covers several verses is reachable from each of them.
/// <c>_rowIndex</c> is left on the first row that was not consumed.
/// Rows without any reference are skipped: the constructor tolerates such rows, and indexing
/// <c>Refs[0]</c> on them would throw.
/// </remarks>
private void UpdateVerseRowsMap()
{
    _verseRowsMap.Clear();
    while (_rowIndex < _rows.Count)
    {
        UpdateUsfmRow row = _rows[_rowIndex];
        if (row.Refs.Count == 0)
        {
            // A row with no references can never be matched to a verse; step over it.
            _rowIndex++;
            continue;
        }

        // Only the first reference decides which chapter a row belongs to.
        if (row.Refs[0].ChapterNum != _verseRowsRef.ChapterNum)
            break;

        var ri = new RowInfo(_rowIndex);
        foreach (ScriptureRef sr in row.Refs)
        {
            if (!_verseRowsMap.TryGetValue(sr.VerseRef, out List<RowInfo> rows))
            {
                rows = new List<RowInfo>();
                _verseRowsMap[sr.VerseRef] = rows;
            }
            rows.Add(ri);
        }
        _rowIndex++;
    }
}

/// <summary>
/// Refills <c>_verseRows</c> with the indices of the not-yet-consumed rows that are mapped to the
/// verse(s) of <c>_verseRowsRef</c>, and resets <c>_verseRowIndex</c> to 0.
/// </summary>
/// <remarks>
/// Each <see cref="RowInfo"/> is flagged as consumed when its index is added, so a row that is mapped
/// from several verses is added only once.
/// </remarks>
private void UpdateVerseRows()
{
    // The map is a dictionary and therefore relies on an equality comparer, so the lookup key has to
    // be expressed in the versification of the rows. A SortedList would not need this conversion, but
    // it would be less efficient.
    VerseRef lookupRef = _verseRowsRef;
    lookupRef.ChangeVersification(_updateRowsVersification);

    _verseRows.Clear();
    _verseRowIndex = 0;

    foreach (VerseRef verse in lookupRef.AllVerses())
    {
        if (!_verseRowsMap.TryGetValue(verse, out List<RowInfo> candidates))
            continue;

        foreach (RowInfo candidate in candidates)
        {
            if (candidate.IsConsumed)
                continue;

            _verseRows.Add(candidate.RowIndex);
            candidate.IsConsumed = true;
        }
    }
}

/// <summary>
/// Bookkeeping entry for one update row: its index in the row list and whether it has already been
/// handed out.
/// </summary>
private class RowInfo
{
    /// <summary>Creates an entry for the row at <paramref name="rowIndex"/>; it starts out not consumed.</summary>
    public RowInfo(int rowIndex) => RowIndex = rowIndex;

    /// <summary>Index of the row this entry stands for.</summary>
    public int RowIndex { get; set; }

    /// <summary>Set once the row index has been added to the current verse rows.</summary>
    public bool IsConsumed { get; set; }
}
}
}
Loading
Loading