# Patch overview: lets USFM update-block handlers report failures through a dedicated
# exception type, and lets UpdateUsfmParserHandler callers decide whether such a failure
# aborts the update or is skipped.
#
# NOTE(review): generic type arguments (e.g. on Func, IReadOnlyList, List, HashSet) are
# absent from this copy of the patch -- presumably stripped during extraction. Confirm
# against the repository before applying.
#
# File 1 (new): IUsfmUpdateBlockHandler.cs
#   - Declares the IUsfmUpdateBlockHandler interface with its single ProcessBlock method.
#   - Adds UsfmUpdateBlockHandlerException, an Exception subclass whose read-only Block
#     property holds the UsfmUpdateBlock passed to its constructor. Two constructors:
#     (message, block) and (message, inner exception, block).
diff --git a/src/SIL.Machine/Corpora/IUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/IUsfmUpdateBlockHandler.cs
new file mode 100644
index 00000000..c07fe823
--- /dev/null
+++ b/src/SIL.Machine/Corpora/IUsfmUpdateBlockHandler.cs
@@ -0,0 +1,26 @@
+using System;
+
+namespace SIL.Machine.Corpora
+{
+    public interface IUsfmUpdateBlockHandler
+    {
+        UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block);
+    }
+
+    public class UsfmUpdateBlockHandlerException : Exception
+    {
+        public UsfmUpdateBlock Block { get; }
+
+        public UsfmUpdateBlockHandlerException(string message, UsfmUpdateBlock block)
+            : base(message)
+        {
+            Block = block;
+        }
+
+        public UsfmUpdateBlockHandlerException(string message, Exception exception, UsfmUpdateBlock block)
+            : base(message, exception)
+        {
+            Block = block;
+        }
+    }
+}
# File 2 (modified): PlaceMarkersUsfmUpdateBlockHandler.cs, all hunks inside ProcessBlock
#   - Removes the sourceSentence local and the statement that appended to it.
#   - The "track seen tokens" branch for Text elements is now guarded by
#     element.MarkedForRemoval alone. The extra clause (Type == Paragraph and
#     ParagraphBehavior == Strip) moves to the condition guarding
#     ignoredElements.Add(element).
#     NOTE(review): in the old position that clause sat inside an
#     `element.Type == Text` check, so it presumably could never be true there -- confirm.
#   - The result of targetSentence.IndexOf(token, start) is now checked. On -1 a
#     UsfmUpdateBlockHandlerException carrying the current block is thrown; previously
#     the -1 was appended to targetTokenStarts.
diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs
index 485cb1fc..d93b8fe0 100644
--- a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs
+++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs
@@ -112,7 +112,6 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
             IReadOnlyList targetTokens = alignmentInfo.TranslationTokens;
 
             int sourceTokenIndex = 0;
-            string sourceSentence = "";
             string targetSentence = "";
             var toPlace = new List();
             var adjacentSourceTokens = new List();
@@ -123,16 +122,9 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
             {
                 if (element.Type == UsfmUpdateBlockElementType.Text)
                 {
-                    if (
-                        element.MarkedForRemoval
-                        || (
-                            element.Type == UsfmUpdateBlockElementType.Paragraph
-                            && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Strip
-                        )
-                    )
+                    if (element.MarkedForRemoval)
                     {
                         string text = element.Tokens[0].ToUsfm();
-                        sourceSentence += text;
 
                         // Track seen tokens
                         while (sourceTokenIndex < sourceTokens.Count && text.Contains(sourceTokens[sourceTokenIndex]))
@@ -152,7 +144,13 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
                     }
                 }
 
-                if (element.MarkedForRemoval)
+                if (
+                    element.MarkedForRemoval
+                    || (
+                        element.Type == UsfmUpdateBlockElementType.Paragraph
+                        && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Strip
+                    )
+                )
                 {
                     ignoredElements.Add(element);
                 }
@@ -174,7 +172,18 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
             int prevLength = 0;
             foreach (string token in targetTokens)
             {
-                targetTokenStarts.Add(targetSentence.IndexOf(token, targetTokenStarts.LastOrDefault() + prevLength));
+                int indexOfTargetTokenInSentence = targetSentence.IndexOf(
+                    token,
+                    targetTokenStarts.LastOrDefault() + prevLength
+                );
+                if (indexOfTargetTokenInSentence == -1)
+                {
+                    throw new UsfmUpdateBlockHandlerException(
+                        $"No token \"{token}\" found in text \"{targetSentence}\" at or beyond index {targetTokenStarts.LastOrDefault() + prevLength}. Is the versification correctly specified?",
+                        block
+                    );
+                }
+                targetTokenStarts.Add(indexOfTargetTokenInSentence);
                 prevLength = token.Length;
             }
 
# File 3 (modified): UpdateUsfmParserHandler.cs
#   - Adds a trailing optional constructor parameter `errorHandler` (default null),
#     stored in the new readonly field _errorHandler. When null is passed, a delegate
#     that always returns false is stored instead.
#   - In EndUpdateBlock, each handler.ProcessBlock call is wrapped in try/catch for
#     UsfmUpdateBlockHandlerException. The caught exception is passed to _errorHandler:
#     a false result rethrows it with `throw;`, a true result lets the loop move on to
#     the next handler. Because the assignment never completed, updateBlock keeps the
#     reference it had before the failing handler ran.
#     NOTE(review): whether the failing handler may have mutated that block in place
#     cannot be told from this patch.
diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
index 10466d98..2e7f77c3 100644
--- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
+++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
@@ -57,6 +57,7 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
         private readonly Stack _replace;
         private int _rowIndex;
         private int _tokenIndex;
+        private readonly Func _errorHandler;
 
         public UpdateUsfmParserHandler(
             IReadOnlyList rows = null,
@@ -67,7 +68,8 @@ public UpdateUsfmParserHandler(
             UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
             IEnumerable preserveParagraphStyles = null,
             IEnumerable updateBlockHandlers = null,
-            IEnumerable remarks = null
+            IEnumerable remarks = null,
+            Func errorHandler = null
         )
         {
             _rows = rows ?? Array.Empty();
@@ -90,6 +92,9 @@ public UpdateUsfmParserHandler(
                     ? new HashSet { "r", "rem" }
                     : new HashSet(preserveParagraphStyles);
             _remarks = remarks == null ? new List() : remarks.ToList();
+            _errorHandler = errorHandler;
+            if (_errorHandler == null)
+                _errorHandler = (error) => false;
         }
 
         public IReadOnlyList Tokens => _tokens;
@@ -576,7 +581,16 @@ private void EndUpdateBlock(UsfmParserState state, IReadOnlyList s
 
             foreach (IUsfmUpdateBlockHandler handler in _updateBlockHandlers)
             {
-                updateBlock = handler.ProcessBlock(updateBlock);
+                try
+                {
+                    updateBlock = handler.ProcessBlock(updateBlock);
+                }
+                catch (UsfmUpdateBlockHandlerException e)
+                {
+                    bool shouldContinue = _errorHandler(e);
+                    if (!shouldContinue)
+                        throw;
+                }
             }
             List tokens = updateBlock.GetTokens();
             foreach (UsfmUpdateBlockElement elem in Enumerable.Reverse(paraElems))
# File 4 (deleted): UsfmUpdateBlockHandler.cs
#   - Its entire content was the IUsfmUpdateBlockHandler interface, which File 1 of this
#     patch re-declares in IUsfmUpdateBlockHandler.cs.
diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs
deleted file mode 100644
index a255665f..00000000
--- a/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs
+++ /dev/null
@@ -1,7 +0,0 @@
-namespace SIL.Machine.Corpora
-{
-    public interface IUsfmUpdateBlockHandler
-    {
-        UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block);
-    }
-}