diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 65273298..85dc470a 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -29,7 +29,9 @@ public string UpdateUsfm( UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable preserveParagraphStyles = null, IEnumerable updateBlockHandlers = null, - IEnumerable remarks = null + IEnumerable remarks = null, + Func errorHandler = null, + bool compareSegments = false ) { string fileName = _settings.GetBookFileName(bookId); @@ -51,7 +53,9 @@ public string UpdateUsfm( styleBehavior, preserveParagraphStyles, updateBlockHandlers, - remarks + remarks, + errorHandler, + compareSegments ); try { diff --git a/src/SIL.Machine/Corpora/ScriptureRefComparer.cs b/src/SIL.Machine/Corpora/ScriptureRefComparer.cs new file mode 100644 index 00000000..e3e7cc7c --- /dev/null +++ b/src/SIL.Machine/Corpora/ScriptureRefComparer.cs @@ -0,0 +1,37 @@ +using System.Collections.Generic; +using SIL.Extensions; +using SIL.Machine.Corpora; + +public class ScriptureRefComparer : IComparer, IEqualityComparer +{ + public static ScriptureRefComparer Default { get; } = new ScriptureRefComparer(compareSegments: true); + public static ScriptureRefComparer IgnoreSegments { get; } = new ScriptureRefComparer(compareSegments: false); + private readonly bool _compareSegments; + + public ScriptureRefComparer(bool compareSegments = true) + { + _compareSegments = compareSegments; + } + + public int Compare(ScriptureRef x, ScriptureRef y) + { + return x.CompareTo(y, _compareSegments); + } + + public bool Equals(ScriptureRef x, ScriptureRef y) + { + return x.CompareTo(y, _compareSegments) == 0; + } + + public int GetHashCode(ScriptureRef obj) + { + int hashCode = 23; + hashCode = + hashCode * 31 + + (_compareSegments ? obj.VerseRef.BBBCCCVVVS.GetHashCode() : obj.VerseRef.BBBCCCVVV.GetHashCode()); + hashCode = hashCode * 31 + obj.Versification.GetHashCode(); + // Using ToRelaxed is necessary to maintain equality across relaxed refs, Equals properly handles relaxed ref comparison + hashCode = hashCode * 31 + obj.ToRelaxed().Path.GetSequenceHashCode(); + return hashCode; + } +} diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index 2c268ebf..f5a52a79 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -18,7 +18,6 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase private VerseRef _curVerseRef; private readonly Stack _curElements; private readonly Stack _curTextType; - private bool _duplicateVerse = false; protected ScriptureRefUsfmParserHandlerBase() { @@ -29,6 +28,8 @@ protected ScriptureRefUsfmParserHandlerBase() protected ScriptureTextType CurrentTextType => _curTextType.Count == 0 ? ScriptureTextType.None : _curTextType.Peek(); + protected bool DuplicateVerse { get; private set; } + private static readonly string[] EmbedStyles = new[] { "f", "fe", "x", "fig" }; private static bool IsEmbedStyle(string marker) @@ -66,13 +67,13 @@ public override void Verse( string pubNumber ) { - if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse) + if (state.VerseRef.Equals(_curVerseRef) && !DuplicateVerse) { if (state.VerseRef.VerseNum > 0) { EndVerseText(state, CreateVerseRefs()); // ignore duplicate verses - _duplicateVerse = true; + DuplicateVerse = true; } } else if (VerseRef.AreOverlappingVersesRanges(verse1: number, verse2: _curVerseRef.Verse)) @@ -251,14 +252,14 @@ protected virtual void EndEmbedText(UsfmParserState state, ScriptureRef scriptur private void StartVerseText(UsfmParserState state) { - _duplicateVerse = false; + DuplicateVerse = false; _curTextType.Push(ScriptureTextType.Verse); StartVerseText(state, CreateVerseRefs()); } private void EndVerseText(UsfmParserState state) { - if (!_duplicateVerse && _curVerseRef.VerseNum > 0) + if (!DuplicateVerse && _curVerseRef.VerseNum > 0) EndVerseText(state, CreateVerseRefs()); if (_curVerseRef.VerseNum > 0) _curTextType.Pop(); @@ -291,7 +292,7 @@ private void StartEmbedText(UsfmParserState state, string marker) { if (_curVerseRef.IsDefault) UpdateVerseRef(state.VerseRef, marker); - if (!_duplicateVerse) + if (!DuplicateVerse) { CheckConvertVerseParaToNonVerse(state); NextElement(marker); @@ -302,7 +303,7 @@ private void StartEmbedText(UsfmParserState state, string marker) private void EndEmbedText(UsfmParserState state) { - if (!_duplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed) + if (!DuplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed) { EndEmbedText(state, CreateNonVerseRef()); _curTextType.Pop(); diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 2e7f77c3..4b9c37ef 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Linq; +using SIL.Scripture; namespace SIL.Machine.Corpora { @@ -42,6 +43,12 @@ public UpdateUsfmRow( public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase { private readonly IReadOnlyList _rows; + private int _rowIndex; + private VerseRef _verseRowsRef; + private readonly List _verseRows; + private int _verseRowIndex; + private readonly Dictionary> _verseRowsMap; + private readonly ScrVers _updateRowsVersification; private readonly List _tokens; private readonly List _updatedText; private readonly List _embedTokens; @@ -55,10 +62,11 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase private readonly Stack _updateBlockHandlers; private readonly List _remarks; private readonly Stack _replace; - private int _rowIndex; private int _tokenIndex; private readonly Func _errorHandler; + private readonly bool _compareSegments; + /// UpdateUsfmRows must be in order public UpdateUsfmParserHandler( IReadOnlyList rows = null, string idText = null, @@ -69,10 +77,18 @@ public UpdateUsfmParserHandler( IEnumerable preserveParagraphStyles = null, IEnumerable updateBlockHandlers = null, IEnumerable remarks = null, - Func errorHandler = null + Func errorHandler = null, + bool compareSegments = false ) { _rows = rows ?? Array.Empty(); + _verseRows = new List(); + _verseRowsMap = new Dictionary>( + compareSegments ? VerseRefComparer.Default : VerseRefComparer.IgnoreSegments + ); + _updateRowsVersification = ScrVers.English; + if (_rows.Count > 0) + _updateRowsVersification = _rows.First(r => r.Refs.Count > 0).Refs[0].Versification; _tokens = new List(); _updatedText = new List(); _updateBlocks = new Stack(); @@ -95,6 +111,7 @@ public UpdateUsfmParserHandler( _errorHandler = errorHandler; if (_errorHandler == null) _errorHandler = (error) => false; + _compareSegments = compareSegments; } public IReadOnlyList Tokens => _tokens; @@ -107,6 +124,10 @@ public override void EndUsfm(UsfmParserState state) public override void StartBook(UsfmParserState state, string marker, string code) { + _verseRowsRef = state.VerseRef; + UpdateVerseRowsMap(); + UpdateVerseRows(); + CollectReadonlyTokens(state); _updateBlocks.Push(new UsfmUpdateBlock()); var startBookTokens = new List(); @@ -137,7 +158,7 @@ IReadOnlyList attributes if (state.IsVerseText) { // Only strip paragraph markers in a verse - if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve) + if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve && !DuplicateVerse) { CollectUpdatableTokens(state); } @@ -193,6 +214,13 @@ string pubNumber { UseUpdatedText(); + if (!_verseRowsRef.Equals(state.VerseRef)) + { + _verseRowsRef = state.VerseRef; + UpdateVerseRowsMap(); + UpdateVerseRows(); + } + base.Chapter(state, number, marker, altNumber, pubNumber); CollectReadonlyTokens(state); @@ -230,16 +258,31 @@ string pubNumber } } + if (!_verseRowsRef.Equals(state.VerseRef)) + { + _verseRowsRef = state.VerseRef; + UpdateVerseRows(); + } + base.Verse(state, number, marker, altNumber, pubNumber); - CollectReadonlyTokens(state); + if (DuplicateVerse) + { + SkipUpdatableTokens(state); + } + else + { + CollectReadonlyTokens(state); + } } public override void StartNote(UsfmParserState state, string marker, string caller, string category) { base.StartNote(state, marker, caller, category); - - CollectUpdatableTokens(state); + if (!DuplicateVerse) + CollectUpdatableTokens(state); + else + SkipUpdatableTokens(state); } public override void EndNote(UsfmParserState state, string marker, bool closed) @@ -319,7 +362,7 @@ public override void Text(UsfmParserState state, string text) base.Text(state, text); // strip out text in verses that are being replaced - if (ReplaceWithNewTokens(state)) + if (ReplaceWithNewTokens(state) || (DuplicateVerse && CurrentTextType == ScriptureTextType.Verse)) SkipUpdatableTokens(state); else CollectUpdatableTokens(state); @@ -390,15 +433,11 @@ public string GetUsfm(UsfmStylesheet stylesheet) remarkTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null)); remarkTokens.Add(new UsfmToken(remark)); } - - if (tokens.Count > 0 && tokens[0].Marker == "id") + if (tokens.Count > 0) { - int index = 1; - if (tokens.Count > 1 && tokens[1].Type == UsfmTokenType.Text) - { - index = 2; - } - while (tokens[index].Marker == "rem") + int index = 0; + HashSet markersToSkip = new HashSet() { "id", "ide", "rem" }; + while (markersToSkip.Contains(tokens[index].Marker)) { index++; if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text) @@ -407,6 +446,7 @@ public string GetUsfm(UsfmStylesheet stylesheet) tokens.InsertRange(index, remarkTokens); } } + return tokenizer.Detokenize(tokens); } @@ -418,11 +458,11 @@ IReadOnlyList segScrRefs Dictionary rowMetadata = null; int sourceIndex = 0; // search the sorted rows with updated text, starting from where we left off last. - while (_rowIndex < _rows.Count && sourceIndex < segScrRefs.Count) + while (_verseRowIndex < _verseRows.Count && sourceIndex < segScrRefs.Count) { // get the set of references for the current row int compare = 0; - UpdateUsfmRow row = _rows[_rowIndex]; + UpdateUsfmRow row = _rows[_verseRows[_verseRowIndex]]; (IReadOnlyList rowScrRefs, string text, IReadOnlyDictionary metadata) = ( row.Refs, row.Text, @@ -432,7 +472,7 @@ IReadOnlyList segScrRefs { while (sourceIndex < segScrRefs.Count) { - compare = rowScrRef.CompareTo(segScrRefs[sourceIndex], compareSegments: false); + compare = rowScrRef.CompareTo(segScrRefs[sourceIndex], compareSegments: _compareSegments); if (compare > 0) // row is ahead of source, increment source sourceIndex++; @@ -451,7 +491,7 @@ IReadOnlyList segScrRefs if (compare <= 0) { // source is ahead row, increment row - _rowIndex++; + _verseRowIndex++; } } return (rowTexts, rowMetadata); @@ -649,5 +689,63 @@ private bool IsNonverseParagraph(UsfmParserState state, UsfmUpdateBlockElement e UsfmTag paraTag = state.Stylesheet.GetTag(paraToken.Marker); return paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != UsfmTextType.NotSpecified; } + + private void UpdateVerseRowsMap() + { + _verseRowsMap.Clear(); + while (_rowIndex < _rows.Count && _rows[_rowIndex].Refs[0].ChapterNum == _verseRowsRef.ChapterNum) + { + UpdateUsfmRow row = _rows[_rowIndex]; + var ri = new RowInfo(_rowIndex); + foreach (ScriptureRef sr in row.Refs) + { + if (!_verseRowsMap.TryGetValue(sr.VerseRef, out List rows)) + { + rows = new List(); + _verseRowsMap[sr.VerseRef] = rows; + } + rows.Add(ri); + } + _rowIndex++; + } + } + + private void UpdateVerseRows() + { + VerseRef vref = _verseRowsRef; + // We are using a dictionary, which uses an equality comparer. As a result, we need to change the + // source verse ref to use the row versification. If we used a SortedList, it wouldn't be necessary, but it + // would be less efficient. + vref.ChangeVersification(_updateRowsVersification); + + _verseRows.Clear(); + _verseRowIndex = 0; + + foreach (VerseRef vr in vref.AllVerses()) + { + if (_verseRowsMap.TryGetValue(vr, out List rows)) + { + foreach (RowInfo row in rows) + { + if (!row.IsConsumed) + { + _verseRows.Add(row.RowIndex); + row.IsConsumed = true; + } + } + } + } + } + + private class RowInfo + { + public RowInfo(int rowIndex) + { + RowIndex = rowIndex; + } + + public int RowIndex { get; set; } + public bool IsConsumed { get; set; } + } } } diff --git a/src/SIL.Machine/Corpora/VerseRefComparer.cs b/src/SIL.Machine/Corpora/VerseRefComparer.cs index 1d49f422..c0aea648 100644 --- a/src/SIL.Machine/Corpora/VerseRefComparer.cs +++ b/src/SIL.Machine/Corpora/VerseRefComparer.cs @@ -6,10 +6,10 @@ namespace SIL.Machine.Corpora { - public class VerseRefComparer : IComparer + public class VerseRefComparer : IComparer, IEqualityComparer { - public static IComparer Default { get; } = new VerseRefComparer(compareSegments: true); - public static IComparer IgnoreSegments { get; } = new VerseRefComparer(compareSegments: false); + public static VerseRefComparer Default { get; } = new VerseRefComparer(compareSegments: true); + public static VerseRefComparer IgnoreSegments { get; } = new VerseRefComparer(compareSegments: false); private readonly bool _compareSegments; @@ -37,5 +37,18 @@ public int Compare(VerseRef x, VerseRef y) } return xArray.Length.CompareTo(yArray.Length); } + + public bool Equals(VerseRef x, VerseRef y) + { + return Compare(x, y) == 0; + } + + public int GetHashCode(VerseRef obj) + { + int hashCode = 23; + hashCode = hashCode * 31 + (_compareSegments ? obj.BBBCCCVVVS.GetHashCode() : obj.BBBCCCVVV.GetHashCode()); + hashCode = hashCode * 31 + obj.Versification.GetHashCode(); + return hashCode; + } } } diff --git a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs index 9769e475..b7503770 100644 --- a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs @@ -746,8 +746,8 @@ public void UpdateUsfm_VersesOutOfOrder() @"\id MAT \c 1 \v 2 new verse 2 -\v 1 -\p +\v 1 new verse 1 +\p new paragraph 2 "; AssertUsfmEquals(target, result); diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index a8fb6b2c..6255fd22 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -1209,6 +1209,103 @@ public void GetUsfm_HeaderReferenceParagraphs() AssertUsfmEquals(target, resultP); } + [Test] + public void GetUsfm_OutOfOrderVerses() + { + var rows = new List + { + new UpdateUsfmRow(ScrRef("MAT 1:1"), "new verse 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "new verse 2"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "new verse 3"), + new UpdateUsfmRow(ScrRef("MAT 1:4"), "new verse 4"), + new UpdateUsfmRow(ScrRef("MAT 1:5"), "new verse 5"), + new UpdateUsfmRow(ScrRef("MAT 1:6a"), "new verse 6a"), + new UpdateUsfmRow(ScrRef("MAT 1:6b"), "new verse 6b"), + new UpdateUsfmRow(ScrRef("MAT 1:6b/1:s"), "new section"), + new UpdateUsfmRow(ScrRef("MAT 1:7"), "new verse 7"), + new UpdateUsfmRow(ScrRef("MAT 1:8"), "new verse 8"), + }; + + string usfm = + @"\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 verse 1 +\v 2 verse 2 +\v 3 verse 3 +\v 6b verse 6b +\s section +\v 7 verse 7 +\v 8 verse 8 +\v 4 verse 4 +\v 5 verse 5 +\v 6a verse 6a +"; + + string target = UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + compareSegments: true + ); + string resultP = + @"\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 new verse 1 +\v 2 new verse 2 +\v 3 new verse 3 +\v 6b new verse 6b +\s new section +\v 7 new verse 7 +\v 8 new verse 8 +\v 4 new verse 4 +\v 5 new verse 5 +\v 6a new verse 6a +"; + AssertUsfmEquals(target, resultP); + } + + [Test] + public void GetUsfm_DuplicateVerses() + { + var rows = new List + { + new UpdateUsfmRow(ScrRef("MAT 1:1"), "new verse 1"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "new verse 2"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "new verse 3"), + new UpdateUsfmRow(ScrRef("MAT 1:4"), "new verse 4"), + }; + + string usfm = + @"\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 verse 1 +\v 2 verse 2 +\v 3 verse 3 +\v 3 another verse 3\f \fr 1.3 \ft Some duplicate verse three note \f* 1 +\p more verse three +\v 4 verse 4 +"; + + string target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip); + string resultP = + @"\id MAT +\c 1 +\s1 beginning-of-chapter header +\p +\v 1 new verse 1 +\v 2 new verse 2 +\v 3 new verse 3 +\v 4 new verse 4 +"; + AssertUsfmEquals(target, resultP); + } + [Test] public void GetUsfm_PreferExisting_AddRemark() { @@ -1219,6 +1316,7 @@ public void GetUsfm_PreferExisting_AddRemark() }; string usfm = @"\id MAT - Test +\ide UTF-8 \rem Existing remark \c 1 \v 1 Some text @@ -1233,6 +1331,7 @@ public void GetUsfm_PreferExisting_AddRemark() ); string result = @"\id MAT - Test +\ide UTF-8 \rem Existing remark \rem New remark \c 1 @@ -1251,6 +1350,7 @@ public void GetUsfm_PreferExisting_AddRemark() ); result = @"\id MAT - Test +\ide UTF-8 \rem Existing remark \rem New remark \rem New remark 2 @@ -1278,7 +1378,8 @@ private static string UpdateUsfm( UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable? preserveParagraphStyles = null, IEnumerable? usfmUpdateBlockHandlers = null, - IEnumerable? remarks = null + IEnumerable? remarks = null, + bool compareSegments = false ) { if (source is null) @@ -1294,7 +1395,9 @@ private static string UpdateUsfm( styleBehavior, preserveParagraphStyles, usfmUpdateBlockHandlers, - remarks + remarks, + (_) => false, + compareSegments ); } else @@ -1309,7 +1412,9 @@ private static string UpdateUsfm( styleBehavior, preserveParagraphStyles, usfmUpdateBlockHandlers, - remarks + remarks, + (_) => false, + compareSegments ); UsfmParser.Parse(source, updater); return updater.GetUsfm();