From 44e775b865365f342ae642d420243709444d6dec Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 25 Sep 2025 13:48:02 -0400 Subject: [PATCH 1/2] Add custom exception; throw in place markers handler; add error handler; fix mis-ported lines --- .../Corpora/IUsfmUpdateBlockHandler.cs | 26 ++ .../PlaceMarkersUsfmUpdateBlockHandler.cs | 412 ++++++++++-------- .../Corpora/UsfmUpdateBlockHandler.cs | 7 - 3 files changed, 252 insertions(+), 193 deletions(-) create mode 100644 src/SIL.Machine/Corpora/IUsfmUpdateBlockHandler.cs delete mode 100644 src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs diff --git a/src/SIL.Machine/Corpora/IUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/IUsfmUpdateBlockHandler.cs new file mode 100644 index 00000000..c07fe823 --- /dev/null +++ b/src/SIL.Machine/Corpora/IUsfmUpdateBlockHandler.cs @@ -0,0 +1,26 @@ +using System; + +namespace SIL.Machine.Corpora +{ + public interface IUsfmUpdateBlockHandler + { + UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block); + } + + public class UsfmUpdateBlockHandlerException : Exception + { + public UsfmUpdateBlock Block { get; } + + public UsfmUpdateBlockHandlerException(string message, UsfmUpdateBlock block) + : base(message) + { + Block = block; + } + + public UsfmUpdateBlockHandlerException(string message, Exception exception, UsfmUpdateBlock block) + : base(message, exception) + { + Block = block; + } + } +} diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs index 485cb1fc..8782492b 100644 --- a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs @@ -34,95 +34,132 @@ UpdateUsfmMarkerBehavior styleBehavior public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler { - public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) + private readonly Func _errorHandler; + + /// + /// Error handler should return true if parsing should continue; error handler should return false if exception should be thrown. Default behavior is to rethrow. + /// + public PlaceMarkersUsfmUpdateBlockHandler(Func errorHandler = null) { - string reference = block.Refs.FirstOrDefault().ToString(); - var elements = block.Elements.ToList(); + _errorHandler = errorHandler; + if (_errorHandler == null) + _errorHandler = (error) => false; + } - // Nothing to do if there are no markers to place or no alignment to use - if (!block.Metadata.TryGetValue(PlaceMarkersAlignmentInfo.MetadataKey, out object alignmentObject)) - { - return block; - } - if (!(alignmentObject is PlaceMarkersAlignmentInfo alignmentInfo)) + public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) + { + try { - return block; - } - if ( - elements.Count == 0 - || alignmentInfo.Alignment.RowCount == 0 - || alignmentInfo.Alignment.ColumnCount == 0 - || !elements.Any(e => - ( - e.Type == UsfmUpdateBlockElementType.Paragraph - && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve - && e.Tokens.Count == 1 - ) - || ( - e.Type == UsfmUpdateBlockElementType.Style - && alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve + string reference = block.Refs.FirstOrDefault().ToString(); + var elements = block.Elements.ToList(); + + // Nothing to do if there are no markers to place or no alignment to use + if (!block.Metadata.TryGetValue(PlaceMarkersAlignmentInfo.MetadataKey, out object alignmentObject)) + { + return block; + } + if (!(alignmentObject is PlaceMarkersAlignmentInfo alignmentInfo)) + { + return block; + } + if ( + elements.Count == 0 + || alignmentInfo.Alignment.RowCount == 0 + || alignmentInfo.Alignment.ColumnCount == 0 + || !elements.Any(e => + ( + e.Type == UsfmUpdateBlockElementType.Paragraph + && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve + && e.Tokens.Count == 1 + ) + || ( + e.Type == UsfmUpdateBlockElementType.Style + && alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve + ) ) ) - ) - { - return block; - } + { + return block; + } - // Paragraph markers at the end of the block should stay there - // Section headers should be ignored but re-inserted in the same position relative to other paragraph markers - var endElements = new List(); - bool eobEmptyParas = true; - var headerElements = new List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)>(); - int paraMarkersLeft = 0; - foreach ((int i, UsfmUpdateBlockElement element) in elements.Select((e, i) => (i, e)).Reverse()) - { - if (element.Type == UsfmUpdateBlockElementType.Paragraph && !element.MarkedForRemoval) + // Paragraph markers at the end of the block should stay there + // Section headers should be ignored but re-inserted in the same position relative to other paragraph markers + var endElements = new List(); + bool eobEmptyParas = true; + var headerElements = new List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)>(); + int paraMarkersLeft = 0; + foreach ((int i, UsfmUpdateBlockElement element) in elements.Select((e, i) => (i, e)).Reverse()) { - if (element.Tokens.Count > 1) + if (element.Type == UsfmUpdateBlockElementType.Paragraph && !element.MarkedForRemoval) { - headerElements.Insert(0, (paraMarkersLeft, element)); - elements.RemoveAt(i); - } - else - { - paraMarkersLeft++; - - if (eobEmptyParas) + if (element.Tokens.Count > 1) { - endElements.Insert(0, element); + headerElements.Insert(0, (paraMarkersLeft, element)); elements.RemoveAt(i); } + else + { + paraMarkersLeft++; + + if (eobEmptyParas) + { + endElements.Insert(0, element); + elements.RemoveAt(i); + } + } } - } - else if ( - !( - element.Type == UsfmUpdateBlockElementType.Embed - || ( - element.Type == UsfmUpdateBlockElementType.Text - && element.Tokens[0].ToUsfm().Trim().Length == 0 + else if ( + !( + element.Type == UsfmUpdateBlockElementType.Embed + || ( + element.Type == UsfmUpdateBlockElementType.Text + && element.Tokens[0].ToUsfm().Trim().Length == 0 + ) ) ) - ) - { - eobEmptyParas = false; + { + eobEmptyParas = false; + } } - } - IReadOnlyList sourceTokens = alignmentInfo.SourceTokens; - IReadOnlyList targetTokens = alignmentInfo.TranslationTokens; - int sourceTokenIndex = 0; + IReadOnlyList sourceTokens = alignmentInfo.SourceTokens; + IReadOnlyList targetTokens = alignmentInfo.TranslationTokens; + int sourceTokenIndex = 0; - string sourceSentence = ""; - string targetSentence = ""; - var toPlace = new List(); - var adjacentSourceTokens = new List(); - var placedElements = new List(); - var embedElements = new List(); - var ignoredElements = new List(); - foreach (UsfmUpdateBlockElement element in elements) - { - if (element.Type == UsfmUpdateBlockElementType.Text) + string targetSentence = ""; + var toPlace = new List(); + var adjacentSourceTokens = new List(); + var placedElements = new List(); + var embedElements = new List(); + var ignoredElements = new List(); + foreach (UsfmUpdateBlockElement element in elements) { + if (element.Type == UsfmUpdateBlockElementType.Text) + { + if (element.MarkedForRemoval) + { + string text = element.Tokens[0].ToUsfm(); + + // Track seen tokens + while ( + sourceTokenIndex < sourceTokens.Count && text.Contains(sourceTokens[sourceTokenIndex]) + ) + { + text = text.Substring( + text.IndexOf(sourceTokens[sourceTokenIndex]) + sourceTokens[sourceTokenIndex].Length + ); + sourceTokenIndex++; + } + // Handle tokens split across text elements + if (text.Trim().Length > 0) + sourceTokenIndex++; + } + else + { + targetSentence += element.Tokens[0].ToUsfm(); + } + } + if ( element.MarkedForRemoval || ( @@ -131,146 +168,149 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) ) ) { - string text = element.Tokens[0].ToUsfm(); - sourceSentence += text; - - // Track seen tokens - while (sourceTokenIndex < sourceTokens.Count && text.Contains(sourceTokens[sourceTokenIndex])) - { - text = text.Substring( - text.IndexOf(sourceTokens[sourceTokenIndex]) + sourceTokens[sourceTokenIndex].Length - ); - sourceTokenIndex++; - } - // Handle tokens split across text elements - if (text.Trim().Length > 0) - sourceTokenIndex++; + ignoredElements.Add(element); } - else + else if (element.Type == UsfmUpdateBlockElementType.Embed) { - targetSentence += element.Tokens[0].ToUsfm(); + embedElements.Add(element); + } + else if ( + element.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style) + ) + { + toPlace.Add(element); + adjacentSourceTokens.Add(sourceTokenIndex); } } - if (element.MarkedForRemoval) - { - ignoredElements.Add(element); - } - else if (element.Type == UsfmUpdateBlockElementType.Embed) - { - embedElements.Add(element); - } - else if (element.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style)) + if (targetSentence.Trim().Length == 0) + return block; + + var targetTokenStarts = new List(); + int prevLength = 0; + foreach (string token in targetTokens) { - toPlace.Add(element); - adjacentSourceTokens.Add(sourceTokenIndex); + int indexOfTargetTokenInSentence = targetSentence.IndexOf( + token, + targetTokenStarts.LastOrDefault() + prevLength + ); + if (indexOfTargetTokenInSentence == -1) + { + throw new UsfmUpdateBlockHandlerException( + $"No token \"{token}\" found in text \"{targetSentence}\" at or beyond index {targetTokenStarts.LastOrDefault() + prevLength}. Is the versification correctly specified?", + block + ); + } + targetTokenStarts.Add(indexOfTargetTokenInSentence); + prevLength = token.Length; } - } - - if (targetSentence.Trim().Length == 0) - return block; - - var targetTokenStarts = new List(); - int prevLength = 0; - foreach (string token in targetTokens) - { - targetTokenStarts.Add(targetSentence.IndexOf(token, targetTokenStarts.LastOrDefault() + prevLength)); - prevLength = token.Length; - } - var toInsert = new List<(int Index, UsfmUpdateBlockElement Element)>(); - foreach ( - (UsfmUpdateBlockElement element, int adjacentSourceToken) in toPlace - .Zip(adjacentSourceTokens) - .Select(tuple => (tuple.Item1, tuple.Item2)) - ) - { - int adjacentTargetToken = PredictMarkerLocation( - alignmentInfo.Alignment, - adjacentSourceToken, - sourceTokens, - targetTokens - ); - int targetStringIndex; - if ( - adjacentSourceToken > 0 - && element.Type == UsfmUpdateBlockElementType.Style - && element.Tokens[0].Marker[element.Tokens[0].Marker.Length - 1] == '*' + var toInsert = new List<(int Index, UsfmUpdateBlockElement Element)>(); + foreach ( + (UsfmUpdateBlockElement element, int adjacentSourceToken) in toPlace + .Zip(adjacentSourceTokens) + .Select(tuple => (tuple.Item1, tuple.Item2)) ) { - targetStringIndex = - targetTokenStarts[adjacentTargetToken - 1] + targetTokens[adjacentTargetToken - 1].Length; - } - else if (adjacentTargetToken < targetTokenStarts.Count) - { - targetStringIndex = targetTokenStarts[adjacentTargetToken]; - } - else - { - targetStringIndex = targetSentence.Length; - } - toInsert.Add((targetStringIndex, element)); - } - toInsert.Sort((p1, p2) => p1.Index.CompareTo(p2.Index)); - toInsert.AddRange(embedElements.Concat(endElements).Select(e => (targetSentence.Length, e))); - - // Construct new text tokens to put between markers - // and reincorporate headers and empty end-of-verse paragraph markers - if (toInsert[0].Index > 0) - { - placedElements.Add( - new UsfmUpdateBlockElement( - UsfmUpdateBlockElementType.Text, - new List() { new UsfmToken(targetSentence.Substring(0, toInsert[0].Index)) } + int adjacentTargetToken = PredictMarkerLocation( + alignmentInfo.Alignment, + adjacentSourceToken, + sourceTokens, + targetTokens + ); + int targetStringIndex; + if ( + adjacentSourceToken > 0 + && element.Type == UsfmUpdateBlockElementType.Style + && element.Tokens[0].Marker[element.Tokens[0].Marker.Length - 1] == '*' ) - ); - } - - foreach ((int j, (int insertIndex, UsfmUpdateBlockElement element)) in toInsert.Select((p, i) => (i, p))) - { - if (element.Type == UsfmUpdateBlockElementType.Paragraph) - { - while (headerElements.Count > 0 && headerElements[0].ParaMarkersLeft == paraMarkersLeft) { - placedElements.Add(headerElements[0].Element); - headerElements.RemoveAt(0); + targetStringIndex = + targetTokenStarts[adjacentTargetToken - 1] + targetTokens[adjacentTargetToken - 1].Length; } - paraMarkersLeft--; + else if (adjacentTargetToken < targetTokenStarts.Count) + { + targetStringIndex = targetTokenStarts[adjacentTargetToken]; + } + else + { + targetStringIndex = targetSentence.Length; + } + toInsert.Add((targetStringIndex, element)); } + toInsert.Sort((p1, p2) => p1.Index.CompareTo(p2.Index)); + toInsert.AddRange(embedElements.Concat(endElements).Select(e => (targetSentence.Length, e))); - placedElements.Add(element); - if ( - insertIndex < targetSentence.Length - && (j + 1 == toInsert.Count || insertIndex < toInsert[j + 1].Index) + // Construct new text tokens to put between markers + // and reincorporate headers and empty end-of-verse paragraph markers + if (toInsert[0].Index > 0) + { + placedElements.Add( + new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + new List() { new UsfmToken(targetSentence.Substring(0, toInsert[0].Index)) } + ) + ); + } + + foreach ( + (int j, (int insertIndex, UsfmUpdateBlockElement element)) in toInsert.Select((p, i) => (i, p)) ) { - UsfmToken textToken; - if (j + 1 < toInsert.Count) + if (element.Type == UsfmUpdateBlockElementType.Paragraph) { - textToken = new UsfmToken( - targetSentence.Substring(insertIndex, toInsert[j + 1].Index - insertIndex) - ); + while (headerElements.Count > 0 && headerElements[0].ParaMarkersLeft == paraMarkersLeft) + { + placedElements.Add(headerElements[0].Element); + headerElements.RemoveAt(0); + } + paraMarkersLeft--; } - else + + placedElements.Add(element); + if ( + insertIndex < targetSentence.Length + && (j + 1 == toInsert.Count || insertIndex < toInsert[j + 1].Index) + ) { - textToken = new UsfmToken(targetSentence.Substring(insertIndex)); + UsfmToken textToken; + if (j + 1 < toInsert.Count) + { + textToken = new UsfmToken( + targetSentence.Substring(insertIndex, toInsert[j + 1].Index - insertIndex) + ); + } + else + { + textToken = new UsfmToken(targetSentence.Substring(insertIndex)); + } + placedElements.Add( + new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + new List { textToken } + ) + ); } - placedElements.Add( - new UsfmUpdateBlockElement(UsfmUpdateBlockElementType.Text, new List { textToken }) - ); } + while (headerElements.Count > 0) + { + placedElements.Add(headerElements[0].Element); + headerElements.RemoveAt(0); + } + + var processedBlock = new UsfmUpdateBlock( + refs: block.Refs, + elements: placedElements.Concat(ignoredElements) + ); + return processedBlock; } - while (headerElements.Count > 0) + catch (UsfmUpdateBlockHandlerException e) { - placedElements.Add(headerElements[0].Element); - headerElements.RemoveAt(0); + bool shouldContinue = _errorHandler(e); + if (!shouldContinue) + throw; + return block; } - - var processedBlock = new UsfmUpdateBlock( - refs: block.Refs, - elements: placedElements.Concat(ignoredElements) - ); - return processedBlock; } private int PredictMarkerLocation( diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs deleted file mode 100644 index a255665f..00000000 --- a/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs +++ /dev/null @@ -1,7 +0,0 @@ -namespace SIL.Machine.Corpora -{ - public interface IUsfmUpdateBlockHandler - { - UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block); - } -} From 532c321d25841f989d3c661ff2c058d74536711c Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 26 Sep 2025 13:00:55 -0400 Subject: [PATCH 2/2] Move error handler to UpdateUsfmParserHandler --- .../PlaceMarkersUsfmUpdateBlockHandler.cs | 423 ++++++++---------- .../Corpora/UpdateUsfmParserHandler.cs | 18 +- 2 files changed, 212 insertions(+), 229 deletions(-) diff --git a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs index 8782492b..d93b8fe0 100644 --- a/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs @@ -34,283 +34,252 @@ UpdateUsfmMarkerBehavior styleBehavior public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler { - private readonly Func _errorHandler; - - /// - /// Error handler should return true if parsing should continue; error handler should return false if exception should be thrown. Default behavior is to rethrow. - /// - public PlaceMarkersUsfmUpdateBlockHandler(Func errorHandler = null) - { - _errorHandler = errorHandler; - if (_errorHandler == null) - _errorHandler = (error) => false; - } - public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) { - try - { - string reference = block.Refs.FirstOrDefault().ToString(); - var elements = block.Elements.ToList(); + string reference = block.Refs.FirstOrDefault().ToString(); + var elements = block.Elements.ToList(); - // Nothing to do if there are no markers to place or no alignment to use - if (!block.Metadata.TryGetValue(PlaceMarkersAlignmentInfo.MetadataKey, out object alignmentObject)) - { - return block; - } - if (!(alignmentObject is PlaceMarkersAlignmentInfo alignmentInfo)) - { - return block; - } - if ( - elements.Count == 0 - || alignmentInfo.Alignment.RowCount == 0 - || alignmentInfo.Alignment.ColumnCount == 0 - || !elements.Any(e => - ( - e.Type == UsfmUpdateBlockElementType.Paragraph - && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve - && e.Tokens.Count == 1 - ) - || ( - e.Type == UsfmUpdateBlockElementType.Style - && alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve - ) + // Nothing to do if there are no markers to place or no alignment to use + if (!block.Metadata.TryGetValue(PlaceMarkersAlignmentInfo.MetadataKey, out object alignmentObject)) + { + return block; + } + if (!(alignmentObject is PlaceMarkersAlignmentInfo alignmentInfo)) + { + return block; + } + if ( + elements.Count == 0 + || alignmentInfo.Alignment.RowCount == 0 + || alignmentInfo.Alignment.ColumnCount == 0 + || !elements.Any(e => + ( + e.Type == UsfmUpdateBlockElementType.Paragraph + && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve + && e.Tokens.Count == 1 + ) + || ( + e.Type == UsfmUpdateBlockElementType.Style + && alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve ) ) - { - return block; - } + ) + { + return block; + } - // Paragraph markers at the end of the block should stay there - // Section headers should be ignored but re-inserted in the same position relative to other paragraph markers - var endElements = new List(); - bool eobEmptyParas = true; - var headerElements = new List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)>(); - int paraMarkersLeft = 0; - foreach ((int i, UsfmUpdateBlockElement element) in elements.Select((e, i) => (i, e)).Reverse()) + // Paragraph markers at the end of the block should stay there + // Section headers should be ignored but re-inserted in the same position relative to other paragraph markers + var endElements = new List(); + bool eobEmptyParas = true; + var headerElements = new List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)>(); + int paraMarkersLeft = 0; + foreach ((int i, UsfmUpdateBlockElement element) in elements.Select((e, i) => (i, e)).Reverse()) + { + if (element.Type == UsfmUpdateBlockElementType.Paragraph && !element.MarkedForRemoval) { - if (element.Type == UsfmUpdateBlockElementType.Paragraph && !element.MarkedForRemoval) + if (element.Tokens.Count > 1) { - if (element.Tokens.Count > 1) + headerElements.Insert(0, (paraMarkersLeft, element)); + elements.RemoveAt(i); + } + else + { + paraMarkersLeft++; + + if (eobEmptyParas) { - headerElements.Insert(0, (paraMarkersLeft, element)); + endElements.Insert(0, element); elements.RemoveAt(i); } - else - { - paraMarkersLeft++; - - if (eobEmptyParas) - { - endElements.Insert(0, element); - elements.RemoveAt(i); - } - } } - else if ( - !( - element.Type == UsfmUpdateBlockElementType.Embed - || ( - element.Type == UsfmUpdateBlockElementType.Text - && element.Tokens[0].ToUsfm().Trim().Length == 0 - ) + } + else if ( + !( + element.Type == UsfmUpdateBlockElementType.Embed + || ( + element.Type == UsfmUpdateBlockElementType.Text + && element.Tokens[0].ToUsfm().Trim().Length == 0 ) ) - { - eobEmptyParas = false; - } + ) + { + eobEmptyParas = false; } + } - IReadOnlyList sourceTokens = alignmentInfo.SourceTokens; - IReadOnlyList targetTokens = alignmentInfo.TranslationTokens; - int sourceTokenIndex = 0; + IReadOnlyList sourceTokens = alignmentInfo.SourceTokens; + IReadOnlyList targetTokens = alignmentInfo.TranslationTokens; + int sourceTokenIndex = 0; - string targetSentence = ""; - var toPlace = new List(); - var adjacentSourceTokens = new List(); - var placedElements = new List(); - var embedElements = new List(); - var ignoredElements = new List(); - foreach (UsfmUpdateBlockElement element in elements) + string targetSentence = ""; + var toPlace = new List(); + var adjacentSourceTokens = new List(); + var placedElements = new List(); + var embedElements = new List(); + var ignoredElements = new List(); + foreach (UsfmUpdateBlockElement element in elements) + { + if (element.Type == UsfmUpdateBlockElementType.Text) { - if (element.Type == UsfmUpdateBlockElementType.Text) + if (element.MarkedForRemoval) { - if (element.MarkedForRemoval) - { - string text = element.Tokens[0].ToUsfm(); + string text = element.Tokens[0].ToUsfm(); - // Track seen tokens - while ( - sourceTokenIndex < sourceTokens.Count && text.Contains(sourceTokens[sourceTokenIndex]) - ) - { - text = text.Substring( - text.IndexOf(sourceTokens[sourceTokenIndex]) + sourceTokens[sourceTokenIndex].Length - ); - sourceTokenIndex++; - } - // Handle tokens split across text elements - if (text.Trim().Length > 0) - sourceTokenIndex++; - } - else + // Track seen tokens + while (sourceTokenIndex < sourceTokens.Count && text.Contains(sourceTokens[sourceTokenIndex])) { - targetSentence += element.Tokens[0].ToUsfm(); + text = text.Substring( + text.IndexOf(sourceTokens[sourceTokenIndex]) + sourceTokens[sourceTokenIndex].Length + ); + sourceTokenIndex++; } + // Handle tokens split across text elements + if (text.Trim().Length > 0) + sourceTokenIndex++; } - - if ( - element.MarkedForRemoval - || ( - element.Type == UsfmUpdateBlockElementType.Paragraph - && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Strip - ) - ) - { - ignoredElements.Add(element); - } - else if (element.Type == UsfmUpdateBlockElementType.Embed) + else { - embedElements.Add(element); + targetSentence += element.Tokens[0].ToUsfm(); } - else if ( - element.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style) + } + + if ( + element.MarkedForRemoval + || ( + element.Type == UsfmUpdateBlockElementType.Paragraph + && alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Strip ) - { - toPlace.Add(element); - adjacentSourceTokens.Add(sourceTokenIndex); - } + ) + { + ignoredElements.Add(element); + } + else if (element.Type == UsfmUpdateBlockElementType.Embed) + { + embedElements.Add(element); } + else if (element.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style)) + { + toPlace.Add(element); + adjacentSourceTokens.Add(sourceTokenIndex); + } + } - if (targetSentence.Trim().Length == 0) - return block; + if (targetSentence.Trim().Length == 0) + return block; - var targetTokenStarts = new List(); - int prevLength = 0; - foreach (string token in targetTokens) + var targetTokenStarts = new List(); + int prevLength = 0; + foreach (string token in targetTokens) + { + int indexOfTargetTokenInSentence = targetSentence.IndexOf( + token, + targetTokenStarts.LastOrDefault() + prevLength + ); + if (indexOfTargetTokenInSentence == -1) { - int indexOfTargetTokenInSentence = targetSentence.IndexOf( - token, - targetTokenStarts.LastOrDefault() + prevLength + throw new UsfmUpdateBlockHandlerException( + $"No token \"{token}\" found in text \"{targetSentence}\" at or beyond index {targetTokenStarts.LastOrDefault() + prevLength}. Is the versification correctly specified?", + block ); - if (indexOfTargetTokenInSentence == -1) - { - throw new UsfmUpdateBlockHandlerException( - $"No token \"{token}\" found in text \"{targetSentence}\" at or beyond index {targetTokenStarts.LastOrDefault() + prevLength}. Is the versification correctly specified?", - block - ); - } - targetTokenStarts.Add(indexOfTargetTokenInSentence); - prevLength = token.Length; } + targetTokenStarts.Add(indexOfTargetTokenInSentence); + prevLength = token.Length; + } - var toInsert = new List<(int Index, UsfmUpdateBlockElement Element)>(); - foreach ( - (UsfmUpdateBlockElement element, int adjacentSourceToken) in toPlace - .Zip(adjacentSourceTokens) - .Select(tuple => (tuple.Item1, tuple.Item2)) + var toInsert = new List<(int Index, UsfmUpdateBlockElement Element)>(); + foreach ( + (UsfmUpdateBlockElement element, int adjacentSourceToken) in toPlace + .Zip(adjacentSourceTokens) + .Select(tuple => (tuple.Item1, tuple.Item2)) + ) + { + int adjacentTargetToken = PredictMarkerLocation( + alignmentInfo.Alignment, + adjacentSourceToken, + sourceTokens, + targetTokens + ); + int targetStringIndex; + if ( + adjacentSourceToken > 0 + && element.Type == UsfmUpdateBlockElementType.Style + && element.Tokens[0].Marker[element.Tokens[0].Marker.Length - 1] == '*' ) { - int adjacentTargetToken = PredictMarkerLocation( - alignmentInfo.Alignment, - adjacentSourceToken, - sourceTokens, - targetTokens - ); - int targetStringIndex; - if ( - adjacentSourceToken > 0 - && element.Type == UsfmUpdateBlockElementType.Style - && element.Tokens[0].Marker[element.Tokens[0].Marker.Length - 1] == '*' - ) - { - targetStringIndex = - targetTokenStarts[adjacentTargetToken - 1] + targetTokens[adjacentTargetToken - 1].Length; - } - else if (adjacentTargetToken < targetTokenStarts.Count) - { - targetStringIndex = targetTokenStarts[adjacentTargetToken]; - } - else - { - targetStringIndex = targetSentence.Length; - } - toInsert.Add((targetStringIndex, element)); + targetStringIndex = + targetTokenStarts[adjacentTargetToken - 1] + targetTokens[adjacentTargetToken - 1].Length; } - toInsert.Sort((p1, p2) => p1.Index.CompareTo(p2.Index)); - toInsert.AddRange(embedElements.Concat(endElements).Select(e => (targetSentence.Length, e))); - - // Construct new text tokens to put between markers - // and reincorporate headers and empty end-of-verse paragraph markers - if (toInsert[0].Index > 0) + else if (adjacentTargetToken < targetTokenStarts.Count) { - placedElements.Add( - new UsfmUpdateBlockElement( - UsfmUpdateBlockElementType.Text, - new List() { new UsfmToken(targetSentence.Substring(0, toInsert[0].Index)) } - ) - ); + targetStringIndex = targetTokenStarts[adjacentTargetToken]; } + else + { + targetStringIndex = targetSentence.Length; + } + toInsert.Add((targetStringIndex, element)); + } + toInsert.Sort((p1, p2) => p1.Index.CompareTo(p2.Index)); + toInsert.AddRange(embedElements.Concat(endElements).Select(e => (targetSentence.Length, e))); - foreach ( - (int j, (int insertIndex, UsfmUpdateBlockElement element)) in toInsert.Select((p, i) => (i, p)) - ) + // Construct new text tokens to put between markers + // and reincorporate headers and empty end-of-verse paragraph markers + if (toInsert[0].Index > 0) + { + placedElements.Add( + new UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.Text, + new List() { new UsfmToken(targetSentence.Substring(0, toInsert[0].Index)) } + ) + ); + } + + foreach ((int j, (int insertIndex, UsfmUpdateBlockElement element)) in toInsert.Select((p, i) => (i, p))) + { + if (element.Type == UsfmUpdateBlockElementType.Paragraph) { - if (element.Type == UsfmUpdateBlockElementType.Paragraph) + while (headerElements.Count > 0 && headerElements[0].ParaMarkersLeft == paraMarkersLeft) { - while (headerElements.Count > 0 && headerElements[0].ParaMarkersLeft == paraMarkersLeft) - { - placedElements.Add(headerElements[0].Element); - headerElements.RemoveAt(0); - } - paraMarkersLeft--; + placedElements.Add(headerElements[0].Element); + headerElements.RemoveAt(0); } + paraMarkersLeft--; + } - placedElements.Add(element); - if ( - insertIndex < targetSentence.Length - && (j + 1 == toInsert.Count || insertIndex < toInsert[j + 1].Index) - ) + placedElements.Add(element); + if ( + insertIndex < targetSentence.Length + && (j + 1 == toInsert.Count || insertIndex < toInsert[j + 1].Index) + ) + { + UsfmToken textToken; + if (j + 1 < toInsert.Count) { - UsfmToken textToken; - if (j + 1 < toInsert.Count) - { - textToken = new UsfmToken( - targetSentence.Substring(insertIndex, toInsert[j + 1].Index - insertIndex) - ); - } - else - { - textToken = new UsfmToken(targetSentence.Substring(insertIndex)); - } - placedElements.Add( - new UsfmUpdateBlockElement( - UsfmUpdateBlockElementType.Text, - new List { textToken } - ) + textToken = new UsfmToken( + targetSentence.Substring(insertIndex, toInsert[j + 1].Index - insertIndex) ); } + else + { + textToken = new UsfmToken(targetSentence.Substring(insertIndex)); + } + placedElements.Add( + new UsfmUpdateBlockElement(UsfmUpdateBlockElementType.Text, new List { textToken }) + ); } - while (headerElements.Count > 0) - { - placedElements.Add(headerElements[0].Element); - headerElements.RemoveAt(0); - } - - var processedBlock = new UsfmUpdateBlock( - refs: block.Refs, - elements: placedElements.Concat(ignoredElements) - ); - return processedBlock; } - catch (UsfmUpdateBlockHandlerException e) + while (headerElements.Count > 0) { - bool shouldContinue = _errorHandler(e); - if (!shouldContinue) - throw; - return block; + placedElements.Add(headerElements[0].Element); + headerElements.RemoveAt(0); } + + var processedBlock = new UsfmUpdateBlock( + refs: block.Refs, + elements: placedElements.Concat(ignoredElements) + ); + return processedBlock; } private int PredictMarkerLocation( diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 10466d98..2e7f77c3 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -57,6 +57,7 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase private readonly Stack _replace; private int _rowIndex; private int _tokenIndex; + private readonly Func _errorHandler; public UpdateUsfmParserHandler( IReadOnlyList rows = null, @@ -67,7 +68,8 @@ public UpdateUsfmParserHandler( UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable preserveParagraphStyles = null, IEnumerable updateBlockHandlers = null, - IEnumerable remarks = null + IEnumerable remarks = null, + Func errorHandler = null ) { _rows = rows ?? Array.Empty(); @@ -90,6 +92,9 @@ public UpdateUsfmParserHandler( ? new HashSet { "r", "rem" } : new HashSet(preserveParagraphStyles); _remarks = remarks == null ? new List() : remarks.ToList(); + _errorHandler = errorHandler; + if (_errorHandler == null) + _errorHandler = (error) => false; } public IReadOnlyList Tokens => _tokens; @@ -576,7 +581,16 @@ private void EndUpdateBlock(UsfmParserState state, IReadOnlyList s foreach (IUsfmUpdateBlockHandler handler in _updateBlockHandlers) { - updateBlock = handler.ProcessBlock(updateBlock); + try + { + updateBlock = handler.ProcessBlock(updateBlock); + } + catch (UsfmUpdateBlockHandlerException e) + { + bool shouldContinue = _errorHandler(e); + if (!shouldContinue) + throw; + } } List tokens = updateBlock.GetTokens(); foreach (UsfmUpdateBlockElement elem in Enumerable.Reverse(paraElems))