From c6cb8626b65ad680a1fe26ef6e45ff197a60774d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 22 May 2025 14:51:05 -0400 Subject: [PATCH 1/9] Some porting --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 2 + .../ScriptureRefUsfmParserHandlerBase.cs | 157 ++----- .../Corpora/UpdateUsfmParserHandler.cs | 384 +++++++++--------- src/SIL.Machine/Corpora/UsfmParserState.cs | 4 + src/SIL.Machine/Corpora/UsfmTextBase.cs | 22 +- src/SIL.Machine/Corpora/UsfmUpdateBlock.cs | 90 ++++ .../Corpora/UsfmUpdateBlockElement.cs | 36 ++ .../Corpora/UsfmUpdateBlockHandler.cs | 7 + 8 files changed, 384 insertions(+), 318 deletions(-) create mode 100644 src/SIL.Machine/Corpora/UsfmUpdateBlock.cs create mode 100644 src/SIL.Machine/Corpora/UsfmUpdateBlockElement.cs create mode 100644 src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 06faf34a4..8bc8e6b07 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -28,6 +28,7 @@ public string UpdateUsfm( UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IReadOnlyCollection preserveParagraphStyles = null, + IEnumerable updateBlockHandlers = null, IEnumerable remarks = null ) { @@ -49,6 +50,7 @@ public string UpdateUsfm( embedBehavior, styleBehavior, preserveParagraphStyles, + updateBlockHandlers, remarks ); try diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index ed30e7224..0db3581c9 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -10,7 +10,7 @@ public enum ScriptureTextType None, NonVerse, Verse, - NoteText + Embed } public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase @@ -19,9 +19,6 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase private readonly Stack _curElements; private readonly Stack _curTextType; private bool _duplicateVerse = false; - private bool _inEmbed; - protected bool InNoteText { get; private set; } - private bool _inNestedEmbed; protected ScriptureRefUsfmParserHandlerBase() { @@ -32,8 +29,12 @@ protected ScriptureRefUsfmParserHandlerBase() protected ScriptureTextType CurrentTextType => _curTextType.Count == 0 ? ScriptureTextType.None : _curTextType.Peek(); - private static readonly string[] EmbedStyles = new[] { "f", "fe", "fig", "fm", "x" }; - private static readonly char[] EmbedPartStartCharStyles = new[] { 'f', 'x', 'z' }; + private static readonly string[] EmbedStyles = new[] { "f", "fe", "x", "fig" }; + + private static bool IsEmbedStyle(string marker) + { + return marker != null && (EmbedStyles.Contains(marker.Trim('*')) || marker.StartsWith("z")); + } public override void EndUsfm(UsfmParserState state) { @@ -158,36 +159,6 @@ public override void EndSidebar(UsfmParserState state, string marker, bool close EndParentElement(); } - public override void StartNote(UsfmParserState state, string marker, string caller, string category) - { - _inEmbed = true; - StartEmbed(state, marker); - } - - public override void EndNote(UsfmParserState state, string marker, bool closed) - { - EndNoteTextWrapper(state); - EndEmbed(state, marker, null, closed); - _inEmbed = false; - } - - protected void StartEmbed(UsfmParserState state, string marker) - { - if (_curVerseRef.IsDefault) - UpdateVerseRef(state.VerseRef, marker); - - if (!_duplicateVerse) - { - // if we hit a note in a verse paragraph and we aren't in a verse, then start a non-verse segment - CheckConvertVerseParaToNonVerse(state); - NextElement(marker); - } - - StartEmbed(state, CreateNonVerseRef()); - } - - protected virtual void StartEmbed(UsfmParserState state, ScriptureRef scriptureRef) { } - protected virtual void EndEmbed( UsfmParserState state, string marker, @@ -214,23 +185,11 @@ public override void StartChar( IReadOnlyList attributes ) { - if (IsEmbedPartStyle(markerWithoutPlus) & InNoteText) - _inNestedEmbed = true; - // if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse // segment CheckConvertVerseParaToNonVerse(state); - if (IsEmbedStyle(markerWithoutPlus)) - { - _inEmbed = true; - StartEmbed(state, markerWithoutPlus); - } - - if (IsNoteText(markerWithoutPlus)) - { - StartNoteTextWrapper(state); - } + StartEmbedTextWrapper(state, markerWithoutPlus); } public override void EndChar( @@ -240,22 +199,18 @@ public override void EndChar( bool closed ) { - if (IsEmbedPartStyle(marker)) - { - if (_inNestedEmbed) - { - _inNestedEmbed = false; - } - else - { - EndNoteTextWrapper(state); - } - } if (IsEmbedStyle(marker)) - { - EndEmbed(state, marker, attributes, closed); - _inEmbed = false; - } + EndEmbedTextWrapper(state); + } + + public override void StartNote(UsfmParserState state, string marker, string caller, string category) + { + StartEmbedTextWrapper(state, marker); + } + + public override void EndNote(UsfmParserState state, string marker, bool closed) + { + EndEmbedTextWrapper(state); } protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { } @@ -266,26 +221,9 @@ protected virtual void StartNonVerseText(UsfmParserState state, ScriptureRef scr protected virtual void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { } - protected virtual void StartNoteTextWrapper(UsfmParserState state) - { - InNoteText = true; - _curTextType.Push(ScriptureTextType.NoteText); - StartNoteText(state); - } - - protected virtual void StartNoteText(UsfmParserState state) { } - - protected virtual void EndNoteTextWrapper(UsfmParserState state) - { - if (_curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.NoteText) - { - EndNoteText(state, CreateNonVerseRef()); - _curTextType.Pop(); - InNoteText = false; - } - } + protected virtual void StartEmbedText(UsfmParserState state, ScriptureRef scriptureRef) { } - protected virtual void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) { } + protected virtual void EndEmbedText(UsfmParserState state, ScriptureRef scriptureRef) { } private void StartVerseText(UsfmParserState state) { @@ -325,6 +263,28 @@ private void UpdateVerseRef(VerseRef verseRef, string marker) _curVerseRef = verseRef; } + private void StartEmbedTextWrapper(UsfmParserState state, string marker) + { + if (_curVerseRef.IsDefault) + UpdateVerseRef(state.VerseRef, marker); + if (!_duplicateVerse) + { + CheckConvertVerseParaToNonVerse(state); + NextElement(marker); + _curTextType.Push(ScriptureTextType.Embed); + StartEmbedText(state, CreateNonVerseRef()); + } + } + + private void EndEmbedTextWrapper(UsfmParserState state) + { + if (!_duplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed) + { + EndEmbedText(state, CreateNonVerseRef()); + _curTextType.Pop(); + } + } + private void NextElement(string marker) { ScriptureElement prevElem = _curElements.Pop(); @@ -378,36 +338,5 @@ private void CheckConvertVerseParaToNonVerse(UsfmParserState state) StartNonVerseText(state); } } - - protected bool IsInEmbed(string marker) - { - return _inEmbed || IsEmbedStyle(marker); - } - - protected bool IsInNestedEmbed(string marker) - { - return _inNestedEmbed - || ( - !(marker is null) - && marker.StartsWith("+") - && marker.Length > 1 - && IsEmbedPartStyle(marker.Substring(1)) - ); - } - - protected static bool IsNoteText(string marker) - { - return marker == "ft"; - } - - protected static bool IsEmbedPartStyle(string marker) - { - return !(marker is null) && marker.Length > 0 && marker[0].IsOneOf(EmbedPartStartCharStyles); - } - - protected static bool IsEmbedStyle(string marker) - { - return !(marker is null) && marker.Trim('*').IsOneOf(EmbedStyles); - } } } diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 532a89f0c..8300e2d95 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -25,21 +25,20 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase { private readonly IReadOnlyList<(IReadOnlyList, string)> _rows; private readonly List _tokens; - private readonly List _newTokens; - private readonly List _newEmbedTokens; + private readonly List _updatedText; + private readonly List _embedTokens; private readonly string _idText; private readonly UpdateUsfmTextBehavior _textBehavior; private readonly UpdateUsfmMarkerBehavior _paragraphBehavior; private readonly UpdateUsfmMarkerBehavior _embedBehavior; private readonly UpdateUsfmMarkerBehavior _styleBehavior; private readonly HashSet _preserveParagraphStyles; + private readonly Stack _updateBlocks; + private readonly Stack _updateBlockHandlers; private readonly List _remarks; private readonly Stack _replace; private int _rowIndex; private int _tokenIndex; - private bool _embedUpdated; - private bool _inPreservedParagraph; - private List _embedRowTexts; public UpdateUsfmParserHandler( IReadOnlyList<(IReadOnlyList, string)> rows = null, @@ -49,39 +48,44 @@ public UpdateUsfmParserHandler( UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IReadOnlyCollection preserveParagraphStyles = null, + IReadOnlyCollection updateBlockHandlers = null, IEnumerable remarks = null ) { _rows = rows ?? Array.Empty<(IReadOnlyList, string)>(); _tokens = new List(); - _newTokens = new List(); - _newEmbedTokens = new List(); + _updatedText = new List(); + _embedTokens = new List(); _idText = idText; _replace = new Stack(); _textBehavior = textBehavior; _paragraphBehavior = paragraphBehavior; _embedBehavior = embedBehavior; _styleBehavior = styleBehavior; + _updateBlockHandlers = + updateBlockHandlers == null + ? new Stack() + : new Stack(updateBlockHandlers); _preserveParagraphStyles = preserveParagraphStyles == null ? new HashSet { "r", "rem" } : new HashSet(preserveParagraphStyles); _remarks = remarks == null ? new List() : remarks.ToList(); - _embedUpdated = false; - _embedRowTexts = new List(); + CurrentTextType = ScriptureTextType.None; } public IReadOnlyList Tokens => _tokens; public override void EndUsfm(UsfmParserState state) { - CollectTokens(state); + CollectUpdatableTokens(state); base.EndUsfm(state); } public override void StartBook(UsfmParserState state, string marker, string code) { - CollectTokens(state); + CollectReadonlyTokens(state); + _updateBlocks.Push(new UsfmUpdateBlock()); var startBookTokens = new List(); if (_idText != null) startBookTokens.Add(new UsfmToken(_idText + " ")); @@ -93,15 +97,17 @@ public override void StartBook(UsfmParserState state, string marker, string code startBookTokens.Add(new UsfmToken(remark)); } } - PushNewTokens(startBookTokens); + PushUpdatedText(startBookTokens); base.StartBook(state, marker, code); } public override void EndBook(UsfmParserState state, string marker) { + UseUpdatedText(); PopNewTokens(); - + UsfmUpdateBlock updateBlock = _updateBlocks.Pop(); + _tokens.AddRange(updateBlock.GetTokens()); base.EndBook(state, marker); } @@ -112,56 +118,39 @@ public override void StartPara( IReadOnlyList attributes ) { - if (marker != null && _preserveParagraphStyles.Contains(marker)) - { - _inPreservedParagraph = true; - } if ( state.IsVerseText && (HasNewText() || _textBehavior == UpdateUsfmTextBehavior.StripExisting) && _paragraphBehavior == UpdateUsfmMarkerBehavior.Strip ) { - SkipTokens(state); + SkipUpdatableTokens(state); } else { - CollectTokens(state); + CollectUpdatableTokens(state); } base.StartPara(state, marker, unknown, attributes); } - public override void EndPara(UsfmParserState state, string marker) - { - base.EndPara(state, marker); - _inPreservedParagraph = false; - } - public override void StartRow(UsfmParserState state, string marker) { - CollectTokens(state); + CollectUpdatableTokens(state); base.StartRow(state, marker); } public override void StartCell(UsfmParserState state, string marker, string align, int colspan) { - CollectTokens(state); + CollectUpdatableTokens(state); base.StartCell(state, marker, align, colspan); } - public override void EndCell(UsfmParserState state, string marker) - { - CollectTokens(state); - - base.EndCell(state, marker); - } - public override void StartSidebar(UsfmParserState state, string marker, string category) { - CollectTokens(state); + CollectUpdatableTokens(state); base.StartSidebar(state, marker, category); } @@ -169,7 +158,7 @@ public override void StartSidebar(UsfmParserState state, string marker, string c public override void EndSidebar(UsfmParserState state, string marker, bool closed) { if (closed) - CollectTokens(state); + CollectUpdatableTokens(state); base.EndSidebar(state, marker, closed); } @@ -182,7 +171,7 @@ public override void Chapter( string pubNumber ) { - CollectTokens(state); + UseUpdatedText(); base.Chapter(state, number, marker, altNumber, pubNumber); } @@ -194,7 +183,7 @@ public override void Milestone( IReadOnlyList attributes ) { - CollectTokens(state); + CollectUpdatableTokens(state); base.Milestone(state, marker, startMilestone, attributes); } @@ -207,9 +196,25 @@ public override void Verse( string pubNumber ) { - CollectTokens(state); + UseUpdatedText(); base.Verse(state, number, marker, altNumber, pubNumber); + + CollectReadonlyTokens(state); + } + + public override void StartNote(UsfmParserState state, string marker, string caller, string category) + { + base.StartNote(state, marker, caller, category); + + CollectUpdatableTokens(state); + } + + public override void EndNote(UsfmParserState state, string marker, bool closed) + { + if (closed) + CollectUpdatableTokens(state); + base.EndNote(state, marker, closed); } public override void StartChar( @@ -219,13 +224,24 @@ public override void StartChar( IReadOnlyList attributes ) { - // strip out char-style markers in verses that are being replaced - if (ReplaceWithNewTokens(state)) - SkipTokens(state); - else - CollectTokens(state); - base.StartChar(state, markerWithoutPlus, unknown, attributes); + + if (CurrentTextType == ScriptureTextType.Embed) + { + CollectUpdatableTokens(state); + } + else + { + ReplaceWithNewTokens(state); + if (_styleBehavior == UpdateUsfmMarkerBehavior.Strip) + { + SkipUpdatableTokens(state); + } + else + { + CollectUpdatableTokens(state); + } + } } public override void EndChar( @@ -236,54 +252,34 @@ bool closed ) { // strip out char-style markers in verses that are being replaced - if (ReplaceWithNewTokens(state, closed: closed)) - SkipTokens(state); + if (CurrentTextType == ScriptureTextType.Embed) + { + CollectUpdatableTokens(state); + } else - CollectTokens(state); + { + ReplaceWithNewTokens(state); + if (_styleBehavior == UpdateUsfmMarkerBehavior.Strip) + { + SkipUpdatableTokens(state); + } + else + { + CollectUpdatableTokens(state); + } + } base.EndChar(state, marker, attributes, closed); } - protected override void StartEmbed(UsfmParserState state, ScriptureRef scriptureRef) - { - _embedRowTexts = AdvanceRows(new[] { scriptureRef }).ToList(); - _embedUpdated = _embedRowTexts.Count > 0; - - // strip out notes in verses that are being replaced - if (ReplaceWithNewTokens(state)) - SkipTokens(state); - else - CollectTokens(state); - } - - protected override void EndEmbed( - UsfmParserState state, - string marker, - IReadOnlyList attributes, - bool closed - ) - { - // strip out notes in verses that are being replaced - if (ReplaceWithNewTokens(state, closed: closed)) - SkipTokens(state); - else - CollectTokens(state); - - _embedRowTexts.Clear(); - _embedUpdated = false; - - base.EndEmbed(state, marker, attributes, closed); - } - public override void Ref(UsfmParserState state, string marker, string display, string target) { - // strip out ref in verses that are being replaced + base.Ref(state, marker, display, target); + if (ReplaceWithNewTokens(state)) - SkipTokens(state); + SkipUpdatableTokens(state); else - CollectTokens(state); - - base.Ref(state, marker, display, target); + CollectUpdatableTokens(state); } public override void Text(UsfmParserState state, string text) @@ -292,64 +288,56 @@ public override void Text(UsfmParserState state, string text) // strip out text in verses that are being replaced if (ReplaceWithNewTokens(state)) - SkipTokens(state); + SkipUpdatableTokens(state); else - CollectTokens(state); + CollectUpdatableTokens(state); } public override void OptBreak(UsfmParserState state) { - // strip out optbreaks in verses that are being replaced + base.OptBreak(state); if (ReplaceWithNewTokens(state)) - SkipTokens(state); + SkipUpdatableTokens(state); else - CollectTokens(state); - - base.OptBreak(state); + CollectUpdatableTokens(state); } public override void Unmatched(UsfmParserState state, string marker) { - // strip out unmatched end markers in verses that are being replaced + base.Unmatched(state, marker); + if (ReplaceWithNewTokens(state)) - SkipTokens(state); + SkipUpdatableTokens(state); else - CollectTokens(state); - - base.Unmatched(state, marker); + CollectUpdatableTokens(state); } protected override void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { - IReadOnlyList rowTexts = AdvanceRows(scriptureRefs); - PushNewTokens(rowTexts.Select(t => new UsfmToken(t + " "))); + StartUpdateBlock(scriptureRefs); } protected override void EndVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { - PopNewTokens(); + EndUpdateBlock(scriptureRefs); } protected override void StartNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { - IReadOnlyList rowTexts = AdvanceRows(new[] { scriptureRef }); - PushNewTokens(rowTexts.Select(t => new UsfmToken(t + " "))); + StartUpdateBlock(new[] { scriptureRef }); } protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { - PopNewTokens(); + EndUpdateBlock(new[] { scriptureRef }); } - protected override void StartNoteText(UsfmParserState state) + protected override void EndEmbedText(UsfmParserState state, ScriptureRef scriptureRef) { - PushNewEmbedTokens(_embedRowTexts.Select(t => new UsfmToken(t + " "))); - } - - protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) - { - _embedRowTexts.Clear(); - PopNewTokens(); + _updateBlocks + .Peek() + .AddEmbed(_embedTokens, markedForRemoval: _embedBehavior == UpdateUsfmMarkerBehavior.Strip); + base.EndEmbedText(state, scriptureRef); } public string GetUsfm(string stylesheetFileName = "usfm.sty") @@ -401,83 +389,101 @@ private IReadOnlyList AdvanceRows(IReadOnlyList segScrRefs return rowTexts; } - private void CollectTokens(UsfmParserState state) + private void CollectUpdatableTokens(UsfmParserState state) { - _tokens.AddRange(_newTokens); - _newTokens.Clear(); + UseUpdatedText(); while (_tokenIndex <= state.Index + state.SpecialTokenCount) { - _tokens.Add(state.Tokens[_tokenIndex]); + UsfmToken token = state.Tokens[_tokenIndex]; + if (CurrentTextType == ScriptureTextType.Embed) + { + _embedTokens.Add(token); + } + else if ( + CurrentTextType != ScriptureTextType.None + || (state.ParaTag != null && state.ParaTag.Marker == "id") && _updateBlocks.Count > 0 + ) + { + _updateBlocks.Peek().AddToken(token); + } + else + { + _tokens.Add(token); + } _tokenIndex++; } } - private void SkipTokens(UsfmParserState state) + private void CollectReadonlyTokens(UsfmParserState state) { - _tokenIndex = state.Index + 1 + state.SpecialTokenCount; + while (_tokenIndex <= state.Index + state.SpecialTokenCount) + { + UsfmToken token = state.Tokens[_tokenIndex]; + if (_updateBlocks.Count > 0) + { + _updateBlocks.Peek().AddToken(token); + } + else + { + _tokens.Add(token); + } + _tokenIndex++; + } } - private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true) + private void SkipUpdatableTokens(UsfmParserState state) { - string marker = state?.Token?.Marker; - bool inEmbed = IsInEmbed(marker); + while (_tokenIndex <= state.Index + state.SpecialTokenCount) + { + UsfmToken token = state.Tokens[_tokenIndex]; + if ( + CurrentTextType != ScriptureTextType.None + || (state.ParaTag != null && state.ParaTag.Marker == "id") + ) + { + if (_updateBlocks.Count > 0) + { + _updateBlocks.Peek().AddToken(token); + } + _tokenIndex++; + } + } + _tokenIndex = state.Index + state.SpecialTokenCount + 1; + } - bool inNestedEmbed = IsInNestedEmbed(marker); - bool isStyleTag = marker != null && !IsEmbedPartStyle(marker); + private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true) + { + if (CurrentTextType == ScriptureTextType.Embed) + return false; bool existingText = state .Tokens.Skip(_tokenIndex) .Take(state.Index + 1 + state.SpecialTokenCount - _tokenIndex) .Any(t => t.Type == UsfmTokenType.Text && t.Text.Length > 0); - bool useNewTokens = - !IsInPreservedParagraph(marker) - && ( - _textBehavior == UpdateUsfmTextBehavior.StripExisting - || (HasNewText() && (!existingText || _textBehavior != UpdateUsfmTextBehavior.PreferExisting)) - ) - && (!inEmbed || (InNoteText && !inNestedEmbed && _embedBehavior == UpdateUsfmMarkerBehavior.Preserve)); - - if (useNewTokens) + bool useNewTokens = true; + if (IsInPreservedParagraph(state)) { - if (inEmbed) - AddNewEmbedTokens(); - else - AddNewTokens(); + useNewTokens = false; } - if ( - existingText - && (_textBehavior == UpdateUsfmTextBehavior.PreferExisting || IsInPreservedParagraph(marker)) + else if ( + _textBehavior != UpdateUsfmTextBehavior.StripExisting + && (!HasNewText() || (existingText && _textBehavior == UpdateUsfmTextBehavior.PreferExisting)) ) { - if (inEmbed) - ClearNewEmbedTokens(); - else - ClearNewTokens(); + useNewTokens = false; } - // figure out when to skip the existing text - bool embedInNewVerseText = - (_replace.Any(r => r) || _textBehavior == UpdateUsfmTextBehavior.StripExisting) && inEmbed; - if (embedInNewVerseText || _embedUpdated) - { - if (_embedBehavior == UpdateUsfmMarkerBehavior.Strip) - { - ClearNewEmbedTokens(); - return true; - } - - if (!InNoteText || inNestedEmbed) - return false; - } + if (useNewTokens) + UseUpdatedText(); - bool skipTokens = useNewTokens && closed; + bool clearNewTokens = + existingText + && (_textBehavior == UpdateUsfmTextBehavior.PreferExisting || IsInPreservedParagraph(state)); + if (clearNewTokens) + ClearUpdatedText(); - if (useNewTokens && isStyleTag) - { - skipTokens = _styleBehavior == UpdateUsfmMarkerBehavior.Strip; - } - return skipTokens; + return useNewTokens; } private bool HasNewText() @@ -485,41 +491,53 @@ private bool HasNewText() return _replace.Count > 0 && _replace.Peek(); } - private void PushNewTokens(IEnumerable tokens) + private void StartUpdateBlock(IReadOnlyList scriptureRefs) { - _replace.Push(tokens.Any()); - _newTokens.AddRange(tokens); - } - - private void AddNewTokens() - { - if (_newTokens.Count > 0) - _tokens.AddRange(_newTokens); - _newTokens.Clear(); + _updateBlocks.Push(new UsfmUpdateBlock(scriptureRefs)); + IReadOnlyList rowTexts = AdvanceRows(scriptureRefs); + PushUpdatedText(rowTexts.Select(t => new UsfmToken(UsfmTokenType.Text, text: t + " "))); } - private void ClearNewTokens() + private void EndUpdateBlock(IReadOnlyList scriptureRefs) { - _newTokens.Clear(); + UseUpdatedText(); + PopNewTokens(); + UsfmUpdateBlock updateBlock = _updateBlocks.Pop(); + updateBlock.UpdateRefs(scriptureRefs); + foreach (UsfmUpdateBlockHandler handler in _updateBlockHandlers) + { + updateBlock = handler.ProcessBlock(updateBlock); + } + if ( + _updateBlocks.Count > 0 + && _updateBlocks.Peek().Elements.Last().Type == UsfmUpdateBlockElementType.Paragraph + ) + { + _updateBlocks.Peek().ExtendLastElement(updateBlock.GetTokens()); + } + else + { + _tokens.AddRange(updateBlock.GetTokens()); + } } - private void PushNewEmbedTokens(IEnumerable tokens) + private void PushUpdatedText(IEnumerable tokens) { _replace.Push(tokens.Any()); if (tokens.Any()) - _newEmbedTokens.AddRange(tokens); + _updatedText.AddRange(tokens); } - private void AddNewEmbedTokens() + private void UseUpdatedText() { - if (_newEmbedTokens.Count > 0) - _tokens.AddRange(_newEmbedTokens); - _newEmbedTokens.Clear(); + if (_updatedText.Count > 0) + _updateBlocks.Peek().AddText(_updatedText); + _updatedText.Clear(); } - private void ClearNewEmbedTokens() + private void ClearUpdatedText() { - _newEmbedTokens.Clear(); + _updatedText.Clear(); } private void PopNewTokens() @@ -527,9 +545,9 @@ private void PopNewTokens() _replace.Pop(); } - private bool IsInPreservedParagraph(string marker) + private bool IsInPreservedParagraph(UsfmParserState state) { - return _inPreservedParagraph || _preserveParagraphStyles.Contains(marker); + return state.ParaTag != null && _preserveParagraphStyles.Contains(state.ParaTag.Marker); } } } diff --git a/src/SIL.Machine/Corpora/UsfmParserState.cs b/src/SIL.Machine/Corpora/UsfmParserState.cs index 1ad2c85b0..88ade395a 100644 --- a/src/SIL.Machine/Corpora/UsfmParserState.cs +++ b/src/SIL.Machine/Corpora/UsfmParserState.cs @@ -157,6 +157,10 @@ public bool IsVerseText { get { + // Anything before verse 1 is not verse text + if (VerseRef.VerseNum == 0) + return false; + // Sidebars and notes are not verse text if (_stack.Any(e => e.Type == UsfmElementType.Sidebar || e.Type == UsfmElementType.Note)) return false; diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index ae6fb59fa..9a16b1029 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -258,9 +258,7 @@ public override void Text(UsfmParserState state, string text) } else if (text.Length > 0 && (CurrentTextType != ScriptureTextType.Verse || state.IsVerseText)) { - bool isEmbedOrNestedDontUpdate = - IsInEmbed(state.Token.Marker) && (!InNoteText || IsInNestedEmbed(state.Token.Marker)); - if (isEmbedOrNestedDontUpdate) + if (CurrentTextType == ScriptureTextType.Embed) return; if ( @@ -298,24 +296,6 @@ protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scri _rows.Add(_text.CreateRow(scriptureRef, text, _sentenceStart)); } - protected override void StartNoteText(UsfmParserState state) - { - if (_text._includeMarkers) - return; - - _rowTexts.Push(new StringBuilder()); - } - - protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) - { - if (_text._includeMarkers) - return; - - string text = _rowTexts.Pop().ToString(); - if (_text._includeAllText) - _rows.Add(_text.CreateRow(scriptureRef, text, _sentenceStart)); - } - private void OutputMarker(UsfmParserState state) { if (!_text._includeMarkers || _rowTexts.Count == 0) diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs new file mode 100644 index 000000000..30f2b13f1 --- /dev/null +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs @@ -0,0 +1,90 @@ +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Corpora +{ + public class UsfmUpdateBlock + { + public List Refs { get; } + public List Elements { get; } + + public UsfmUpdateBlock(IEnumerable refs, IEnumerable elements) + { + Refs = refs.ToList(); + Elements = elements.ToList(); + } + + public void AddText(IEnumerable tokens) + { + Elements.Add(new UsfmUpdateBlockElement(UsfmUpdateBlockElementType.Text, tokens.ToList())); + } + + public void AddToken(UsfmToken token, bool markedForRemoval = false) + { + UsfmUpdateBlockElementType type; + switch (token.Type) + { + case UsfmTokenType.Text: + type = UsfmUpdateBlockElementType.Text; + break; + case UsfmTokenType.Paragraph: + type = UsfmUpdateBlockElementType.Paragraph; + break; + case UsfmTokenType.Character: + case UsfmTokenType.End: + type = UsfmUpdateBlockElementType.Style; + break; + default: + type = UsfmUpdateBlockElementType.Other; + break; + } + Elements.Add(new UsfmUpdateBlockElement(type, new List { token }, markedForRemoval)); + } + + public void AddEmbed(IEnumerable tokens, bool markedForRemoval = false) + { + Elements.Add( + new UsfmUpdateBlockElement(UsfmUpdateBlockElementType.Embed, tokens.ToList(), markedForRemoval) + ); + } + + public void ExtendLastElement(IEnumerable tokens) + { + Elements.Last().Tokens.AddRange(tokens); + } + + public void UpdateRefs(IEnumerable refs) + { + Refs.Clear(); + Refs.AddRange(refs); + } + + public List GetTokens() + { + return Elements.SelectMany(e => e.GetTokens()).ToList(); + } + + public override bool Equals(object obj) + { + if (!(obj is UsfmUpdateBlock)) + return false; + + UsfmUpdateBlock other = (UsfmUpdateBlock)obj; + + return Refs.SequenceEqual(other.Refs) && Elements.SequenceEqual(other.Elements); + } + + public override int GetHashCode() + { + int hash = 23; + hash = hash * 31 + Refs.GetHashCode(); + hash = hash * 31 + Elements.GetHashCode(); + return hash; + } + + public UsfmUpdateBlock Clone() + { + return new UsfmUpdateBlock(Refs, Elements); + } + } +} diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlockElement.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlockElement.cs new file mode 100644 index 000000000..cb5a31319 --- /dev/null +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlockElement.cs @@ -0,0 +1,36 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora +{ + public enum UsfmUpdateBlockElementType + { + Text, + Paragraph, + Embed, + Style, + Other + } + + public class UsfmUpdateBlockElement + { + public UsfmUpdateBlockElementType Type { get; } + public List Tokens { get; } + public bool MarkedForRemoval { get; } + + public UsfmUpdateBlockElement( + UsfmUpdateBlockElementType type, + List tokens, + bool markedForRemoval = false + ) + { + Type = type; + Tokens = tokens; + MarkedForRemoval = markedForRemoval; + } + + public List GetTokens() + { + return MarkedForRemoval ? new List() : new List(Tokens); + } + } +} diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs new file mode 100644 index 000000000..68eb0d53f --- /dev/null +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs @@ -0,0 +1,7 @@ +namespace SIL.Machine.Corpora +{ + public abstract class UsfmUpdateBlockHandler + { + public abstract UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block); + } +} From a8d085539a8a007e481aec12c23f439ad4ea500d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 23 May 2025 16:25:23 -0400 Subject: [PATCH 2/9] More porting --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 2 +- .../Corpora/UpdateUsfmParserHandler.cs | 9 +- src/SIL.Machine/Corpora/UsfmUpdateBlock.cs | 9 +- .../Corpora/UpdateUsfmParserHandlerTests.cs | 542 ++++++++++++------ .../Corpora/UsfmFileTextTests.cs | 70 +-- .../Corpora/UsfmMemoryTextTests.cs | 16 +- 6 files changed, 420 insertions(+), 228 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 8bc8e6b07..983b32132 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -27,7 +27,7 @@ public string UpdateUsfm( UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, - IReadOnlyCollection preserveParagraphStyles = null, + IEnumerable preserveParagraphStyles = null, IEnumerable updateBlockHandlers = null, IEnumerable remarks = null ) diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 8300e2d95..791b47b60 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -47,8 +47,8 @@ public UpdateUsfmParserHandler( UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, - IReadOnlyCollection preserveParagraphStyles = null, - IReadOnlyCollection updateBlockHandlers = null, + IEnumerable preserveParagraphStyles = null, + IEnumerable updateBlockHandlers = null, IEnumerable remarks = null ) { @@ -71,7 +71,6 @@ public UpdateUsfmParserHandler( ? new HashSet { "r", "rem" } : new HashSet(preserveParagraphStyles); _remarks = remarks == null ? new List() : remarks.ToList(); - CurrentTextType = ScriptureTextType.None; } public IReadOnlyList Tokens => _tokens; @@ -451,7 +450,7 @@ private void SkipUpdatableTokens(UsfmParserState state) _tokenIndex = state.Index + state.SpecialTokenCount + 1; } - private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true) + private bool ReplaceWithNewTokens(UsfmParserState state) { if (CurrentTextType == ScriptureTextType.Embed) return false; @@ -495,7 +494,7 @@ private void StartUpdateBlock(IReadOnlyList scriptureRefs) { _updateBlocks.Push(new UsfmUpdateBlock(scriptureRefs)); IReadOnlyList rowTexts = AdvanceRows(scriptureRefs); - PushUpdatedText(rowTexts.Select(t => new UsfmToken(UsfmTokenType.Text, text: t + " "))); + PushUpdatedText(rowTexts.Select(t => new UsfmToken(t + " "))); } private void EndUpdateBlock(IReadOnlyList scriptureRefs) diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs index 30f2b13f1..8ee7cce44 100644 --- a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs @@ -8,10 +8,13 @@ public class UsfmUpdateBlock public List Refs { get; } public List Elements { get; } - public UsfmUpdateBlock(IEnumerable refs, IEnumerable elements) + public UsfmUpdateBlock( + IEnumerable refs = null, + IEnumerable elements = null + ) { - Refs = refs.ToList(); - Elements = elements.ToList(); + Refs = refs != null ? refs.ToList() : new List(); + Elements = elements != null ? elements.ToList() : new List(); } public void AddText(IEnumerable tokens) diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 0fbdf7652..40ca62dd2 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -45,8 +45,8 @@ public void GetUsfm_StripAllText() \r keep this reference \rem and this reference too \ip but remove this text -\v 1 Chapter \add one\add*, \p verse \f + \fr 2:1: \ft This is a \fm ∆\fm* footnote.\f*one. -\v 2 Chapter \add one\add*, \p verse \f + \fr 2:1: \ft This is a \fm ∆\fm* footnote.\f*two. +\v 1 Chapter \add one\add*, \p verse \f + \fr 1:1: \ft This is a \+bd ∆\+bd* footnote.\f*one. +\v 2 Chapter \add one\add*, \p verse \f + \fr 1:2: \ft This is a \+bd ∆\+bd* footnote.\f*two. \v 3 Verse 3 \v 4 Verse 4 "; @@ -67,13 +67,13 @@ public void GetUsfm_StripAllText() \rem and this reference too \ip \v 1 Update 1 \add \add* -\p \f + \fr 2:1: \ft \fm ∆\fm*\f* +\p \f + \fr 1:1: \ft This is a \+bd ∆\+bd* footnote.\f* \v 2 \add \add* -\p \f + \fr 2:1: \ft \fm ∆\fm*\f* +\p \f + \fr 1:2: \ft This is a \+bd ∆\+bd* footnote.\f* \v 3 Update 3 \v 4 "; - Assess(target, result); + AssertUsfmEquals(target, result); target = UpdateUsfm( rows, @@ -95,7 +95,7 @@ public void GetUsfm_StripAllText() \v 3 Update 3 \v 4 "; - Assess(target, result); + AssertUsfmEquals(target, result); } [Test] @@ -131,7 +131,7 @@ public void GetUsfm_StripParagraphs_PreserveParagraphStyles() \v 1 Update 1 "; - Assess(target, result); + AssertUsfmEquals(target, result); var targetDiffParagraph = UpdateUsfm( rows, @@ -149,7 +149,7 @@ public void GetUsfm_StripParagraphs_PreserveParagraphStyles() \v 1 Update 1 "; - Assess(targetDiffParagraph, resultDiffParagraph); + AssertUsfmEquals(targetDiffParagraph, resultDiffParagraph); } [Test] @@ -179,7 +179,7 @@ public void GetUsfm_PreserveParagraphs() \v 1 Update 1 "; - Assess(target, result); + AssertUsfmEquals(target, result); var targetDiffParagraph = UpdateUsfm( rows, @@ -196,7 +196,7 @@ public void GetUsfm_PreserveParagraphs() \v 1 Update 1 "; - Assess(targetDiffParagraph, resultDiffParagraph); + AssertUsfmEquals(targetDiffParagraph, resultDiffParagraph); } [Test] @@ -206,6 +206,7 @@ public void GetUsfm_ParagraphInVerse() string usfm = @"\id MAT - Test \c 1 +\p paragraph not in a verse \v 1 verse 1 \p inner verse paragraph \s1 Section Header \v 2 Verse 2 \p inner verse paragraph @@ -216,12 +217,13 @@ public void GetUsfm_ParagraphInVerse() string result = @"\id MAT - Test \c 1 +\p paragraph not in a verse \v 1 Update 1 \s1 Section Header \v 2 Verse 2 \p inner verse paragraph "; - Assess(target, result); + AssertUsfmEquals(target, result); string targetStrip = UpdateUsfm( rows, @@ -233,12 +235,13 @@ public void GetUsfm_ParagraphInVerse() string resultStrip = @"\id MAT \c 1 +\p \v 1 Update 1 \s1 \v 2 "; - Assess(targetStrip, resultStrip); + AssertUsfmEquals(targetStrip, resultStrip); } [Test] @@ -264,7 +267,7 @@ public void GetUsfm_PreferExisting() \v 2 Update 2 \v 3 Other text "; - Assess(target, result); + AssertUsfmEquals(target, result); } [Test] @@ -294,13 +297,9 @@ public void GetUsfm_Verse_StripNote() } [Test] - public void GetUsfm_Verse_ReplaceNote() + public void GetUsfm_Verse_ReplaceWithNote() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:1"), "updated text"), - (ScrRef("MAT 1:1/1:f"), "This is a new footnote.") - }; + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "updated text") }; var usfm = @"\id MAT - Test \c 1 @@ -310,9 +309,9 @@ public void GetUsfm_Verse_ReplaceNote() var result = @"\id MAT - Test \c 1 -\v 1 updated text \f + \fr 2:1: \ft This is a new footnote. \f* +\v 1 updated text \f + \fr 2:1: \ft This is a footnote.\f* "; - Assess(target, result); + AssertUsfmEquals(target, result); } [Test] @@ -434,7 +433,7 @@ public void GetUsfm_MergeVerseSegments() }; string target = UpdateUsfm(rows); - Assert.That(target, Contains.Substring("\\v 2-3 Verse 2. Verse 2a. Verse 2b. \\fm ∆\\fm*\r\n")); + Assert.That(target, Contains.Substring("\\v 2-3 Verse 2. Verse 2a. Verse 2b.\r\n")); } [Test] @@ -533,7 +532,7 @@ public void GetUsfm_NonVerse_Sidebar() { var rows = new List<(IReadOnlyList, string)> { - (ScrRef("MAT 2:3/2:esb/1:ms"), "The first paragraph of the sidebar.") + (ScrRef("MAT 2:3/1:esb/1:ms"), "The first paragraph of the sidebar.") }; string target = UpdateUsfm(rows); @@ -565,7 +564,7 @@ public void GetUsfm_NonVerse_OptBreak() { var rows = new List<(IReadOnlyList, string)> { - (ScrRef("MAT 2:3/2:esb/2:p"), "The second paragraph of the sidebar.") + (ScrRef("MAT 2:3/1:esb/2:p"), "The second paragraph of the sidebar.") }; string target = UpdateUsfm(rows); @@ -597,18 +596,17 @@ public void GetUsfm_NonVerse_SkipNote() } [Test] - public void GetUsfm_NonVerse_ReplaceNote() + public void GetUsfm_NonVerse_ReplaceWithNote() { var rows = new List<(IReadOnlyList, string)> { - (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph."), - (ScrRef("MAT 1:0/3:ip/1:fe"), "This is a new endnote.") + (ScrRef("MAT 1:0/3:ip"), "The introductory paragraph.") }; string target = UpdateUsfm(rows); Assert.That( target, - Contains.Substring("\\ip The introductory paragraph. \\fe + \\ft This is a new endnote. \\fe*\r\n") + Contains.Substring("\\ip The introductory paragraph. \\fe + \\ft This is an endnote.\\fe*\r\n") ); } @@ -671,84 +669,6 @@ public void GetUsfm_Verse_PretranslationsBeforeText() ); } - [Test] - public void EmbedStylePreservation() - { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:1"), "Update the greeting"), - (ScrRef("MAT 1:1/1:f"), "Update the comment"), - (ScrRef("MAT 1:2"), "Update the greeting only"), - (ScrRef("MAT 1:3/1:f"), "Update the comment only"), - }; - var usfm = - @"\id MAT - Test -\c 1 -\v 1 Hello \f \fr 1.1 \ft Some \+bd note\+bd* \f*\bd World \bd* -\v 2 Good \f \fr 1.2 \ft Some other \+bd note\+bd* \f*\bd Morning \bd* -\v 3 Pleasant \f \fr 1.3 \ft A third \+bd note\+bd* \f*\bd Evening \bd* -"; - var target = UpdateUsfm( - rows, - usfm, - embedBehavior: UpdateUsfmMarkerBehavior.Preserve, - styleBehavior: UpdateUsfmMarkerBehavior.Preserve - ); - var resultPp = - @"\id MAT - Test -\c 1 -\v 1 Update the greeting \f \fr 1.1 \ft Update the comment \+bd \+bd*\f*\bd \bd* -\v 2 Update the greeting only \f \fr 1.2 \ft Some other \+bd note\+bd* \f*\bd \bd* -\v 3 Pleasant \f \fr 1.3 \ft Update the comment only \+bd \+bd*\f*\bd Evening \bd* -"; - Assess(target, resultPp); - - target = UpdateUsfm( - rows, - usfm, - embedBehavior: UpdateUsfmMarkerBehavior.Preserve, - styleBehavior: UpdateUsfmMarkerBehavior.Strip - ); - var resultPs = - @"\id MAT - Test -\c 1 -\v 1 Update the greeting \f \fr 1.1 \ft Update the comment \f* -\v 2 Update the greeting only \f \fr 1.2 \ft Some other \+bd note\+bd* \f* -\v 3 Pleasant \f \fr 1.3 \ft Update the comment only \f*\bd Evening \bd* -"; - Assess(target, resultPs); - - target = UpdateUsfm( - rows, - usfm, - embedBehavior: UpdateUsfmMarkerBehavior.Strip, - styleBehavior: UpdateUsfmMarkerBehavior.Preserve - ); - var resultSp = - @"\id MAT - Test -\c 1 -\v 1 Update the greeting \bd \bd* -\v 2 Update the greeting only \bd \bd* -\v 3 Pleasant \bd Evening \bd* -"; - Assess(target, resultSp); - - target = UpdateUsfm( - rows, - usfm, - embedBehavior: UpdateUsfmMarkerBehavior.Strip, - styleBehavior: UpdateUsfmMarkerBehavior.Strip - ); - var resultSs = - @"\id MAT - Test -\c 1 -\v 1 Update the greeting -\v 2 Update the greeting only -\v 3 Pleasant \bd Evening \bd* -"; - Assess(target, resultSs); - } - [Test] public void GetUsfm_StripParagraphs() { @@ -780,7 +700,7 @@ public void GetUsfm_StripParagraphs() \v 2 Hello \p World "; - Assess(target, resultP); + AssertUsfmEquals(target, resultP); target = UpdateUsfm(rows, usfm, paragraphBehavior: UpdateUsfmMarkerBehavior.Strip); var resultS = @@ -792,7 +712,7 @@ public void GetUsfm_StripParagraphs() \v 2 Hello \p World "; - Assess(target, resultS); + AssertUsfmEquals(target, resultS); } [Test] @@ -815,7 +735,7 @@ public void GetUsfm_PreservationRawStrings() \c 1 \v 1 Update all in one row \f \fr 1.1 \ft Some note \f* "; - Assess(target, result); + AssertUsfmEquals(target, result); } [Test] @@ -835,25 +755,7 @@ public void GetUsfm_BeginningOfVerseEmbed() \c 1 \v 1 Updated text "; - Assess(target, result); - } - - [Test] - public void EmptyNote() - { - var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1/1:f"), "Update the note") }; - var usfm = - @"\id MAT - Test -\c 1 -\v 1 Empty Note \f \fr 1.1 \ft \f* -"; - var target = UpdateUsfm(rows, usfm); - var result = - @"\id MAT - Test -\c 1 -\v 1 Empty Note \f \fr 1.1 \ft Update the note \f* -"; - Assess(target, result); + AssertUsfmEquals(target, result); } [Test] @@ -874,29 +776,29 @@ public void CrossReferenceDontUpdate() \c 1 \v 1 Cross reference verse \x - \xo 2:3-4 \xt Cool Book 3:24 \xta The annotation \x* and more content. "; - Assess(target, result); + AssertUsfmEquals(target, result); } [Test] - public void PreserveFigAndFm() + public void PreserveFig() { var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update"), }; var usfm = @"\id MAT - Test \c 1 -\v 1 initial text \fig stuff\fig* more text \fm * \fm* and more. +\v 1 initial text \fig stuff\fig* more text and more. "; var target = UpdateUsfm(rows, usfm); var result = @"\id MAT - Test \c 1 -\v 1 Update \fig stuff\fig*\fm * \fm* +\v 1 Update \fig stuff\fig* "; - Assess(target, result); + AssertUsfmEquals(target, result); } [Test] - public void NestedXt() + public void NoteExplicitEndMarkers() { var rows = new List<(IReadOnlyList, string)> { @@ -906,15 +808,15 @@ public void NestedXt() var usfm = @"\id MAT - Test \c 1 -\v 1 initial text \f + \fr 15.8 \ft Text (\+xt reference\+xt*). And more.\f* and the end. +\v 1 initial text \f + \fr 2.4\fr* \fk The \+nd Lord\+nd*:\fk* \ft See \+nd Lord\+nd* in Word List.\ft*\f* and the end. "; var target = UpdateUsfm(rows, usfm); var result = @"\id MAT - Test \c 1 -\v 1 Update text \f + \fr 15.8 \ft Update note \+xt reference\+xt*\f* +\v 1 Update text \f + \fr 2.4\fr* \fk The \+nd Lord\+nd*:\fk* \ft See \+nd Lord\+nd* in Word List.\ft*\f* "; - Assess(target, result); + AssertUsfmEquals(target, result); target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); var result2 = @@ -922,67 +824,327 @@ public void NestedXt() \c 1 \v 1 Update text "; - Assess(target, result2); + AssertUsfmEquals(target, result2); } [Test] - public void NonNestedXt() + public void UpdateBlock_Verse_PreserveParas() { - var rows = new List<(IReadOnlyList, string)> - { - (ScrRef("MAT 1:1"), "Update text"), - (ScrRef("MAT 1:1/1:f"), "Update note"), - }; + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; var usfm = @"\id MAT - Test \c 1 -\v 1 initial text \f + \fr 15.8 \ft Text \xt reference\f* and the end. +\v 1 verse 1 \p inner verse paragraph "; - var target = UpdateUsfm(rows, usfm); - var result = + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm( + rows, + usfm, + embedBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] + ); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count == 1); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "verse 1 ", true), + (UsfmUpdateBlockElementType.Paragraph, "\\p ", false), + (UsfmUpdateBlockElementType.Text, "inner verse paragraph ", true) + ); + } + + [Test] + public void UpdateBlock_Verse_StripParas() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = @"\id MAT - Test \c 1 -\v 1 Update text \f + \fr 15.8 \ft Update note \xt reference\f* +\v 1 verse 1 \p inner verse paragraph "; - Assess(target, result); + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm( + rows, + usfm, + paragraphBehavior: UpdateUsfmMarkerBehavior.Strip, + usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] + ); - target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); - var result2 = + Assert.That(usfmUpdateBlockHandler.Blocks.Count == 1); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "verse 1 ", true), + (UsfmUpdateBlockElementType.Paragraph, "\\p ", true), + (UsfmUpdateBlockElementType.Text, "inner verse paragraph ", true) + ); + } + + [Test] + public void UpdateBlock_Verse_Range() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = @"\id MAT - Test \c 1 -\v 1 Update text +\v 1-3 verse 1 through 3 "; - Assess(target, result2); + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm( + rows, + usfm, + embedBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] + ); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count == 1); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1-3 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "verse 1 through 3 ", true) + ); + } + + [Test] + public void UpdateBlock_Footnote_PreserveEmbeds() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = + @"\id MAT - Test +\c 1 +\v 1 verse\f \fr 1.1 \ft Some note \f* 1 +"; + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm( + rows, + usfm, + embedBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] + ); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count == 1); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "verse", true), + (UsfmUpdateBlockElementType.Embed, "\\f \\fr 1.1 \\ft Some note \\f*", false), + (UsfmUpdateBlockElementType.Text, " 1 ", true) + ); + } + + [Test] + public void UpdateBlock_Footnote_StripEmbeds() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = + @"\id MAT - Test +\c 1 +\v 1 verse\f \fr 1.1 \ft Some note \f* 1 +"; + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm( + rows, + usfm, + embedBehavior: UpdateUsfmMarkerBehavior.Strip, + usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] + ); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "verse", true), + (UsfmUpdateBlockElementType.Embed, "\\f \\fr 1.1 \\ft Some note \\f*", true), + (UsfmUpdateBlockElementType.Text, " 1 ", true) + ); } [Test] - public void MultipleFtOnlyUpdateFirst() + public void UpdateBlock_NonVerse() { var rows = new List<(IReadOnlyList, string)> { - (ScrRef("MAT 1:1"), "Update text"), - (ScrRef("MAT 1:1/1:f"), "Update note"), + (ScrRef("MAT 1:0/1:s"), "Updated section header") }; var usfm = @"\id MAT - Test +\s Section header \c 1 -\v 1 initial text \f + \fr 15.8 \ft first note \ft second note\f* and the end. +\v 1 verse 1 "; - var target = UpdateUsfm(rows, usfm); - var result = + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [usfmUpdateBlockHandler]); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(2)); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + "MAT 1:0/1:s", + (UsfmUpdateBlockElementType.Text, "Updated section Header ", false), + (UsfmUpdateBlockElementType.Text, "Section header ", true) + ); + } + + [Test] + public void UpdateBlock_Verse_PreserveStyles() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = @"\id MAT - Test \c 1 -\v 1 Update text \f + \fr 15.8 \ft Update note \ft second note\f* +\v 1 verse \bd 1\bd* "; - Assess(target, result); + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm( + rows, + usfm, + styleBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] + ); - target = UpdateUsfm(rows, usfm, embedBehavior: UpdateUsfmMarkerBehavior.Strip); - var result2 = + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "verse ", true), + (UsfmUpdateBlockElementType.Style, "\\bd ", false), + (UsfmUpdateBlockElementType.Text, "1", true), + (UsfmUpdateBlockElementType.Style, "\\bd*", false), + (UsfmUpdateBlockElementType.Text, " ", true) + ); + } + + [Test] + public void UpdateBlock_Verse_StripStyles() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = @"\id MAT - Test \c 1 -\v 1 Update text +\v 1 verse \bd 1\bd* +"; + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm( + rows, + usfm, + styleBehavior: UpdateUsfmMarkerBehavior.Strip, + usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] + ); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "verse ", true), + (UsfmUpdateBlockElementType.Style, "\\bd ", true), + (UsfmUpdateBlockElementType.Text, "1", true), + (UsfmUpdateBlockElementType.Style, "\\bd*", true), + (UsfmUpdateBlockElementType.Text, " ", true) + ); + } + + [Test] + public void UpdateBlock_Verse_SectionHeader() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = + @"\id MAT - Test +\c 1 +\p +\v 1 Verse 1 +\s Section header +\p +\v 2 Verse 2 "; - Assess(target, result2); + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [usfmUpdateBlockHandler]); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(4)); + AssertUpdateBlockEquals(usfmUpdateBlockHandler.Blocks[0], "MAT 1:0/1:p"); + AssertUpdateBlockEquals( + usfmUpdateBlockHandler.Blocks[1], + "MAT 1:1/1:s", + (UsfmUpdateBlockElementType.Text, "Section header ", false) + ); + AssertUpdateBlockEquals( + usfmUpdateBlockHandler.Blocks[2], + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "Verse 1 ", true), + (UsfmUpdateBlockElementType.Paragraph, "\\s Section header ", false), + (UsfmUpdateBlockElementType.Paragraph, "\\p ", false) + ); + AssertUpdateBlockEquals( + usfmUpdateBlockHandler.Blocks[3], + "MAT 1:2", + (UsfmUpdateBlockElementType.Other, "\\v 2 ", false), + (UsfmUpdateBlockElementType.Text, "Verse 2 ", false) + ); + } + + [Test] + public void UpdateBlock_Verse_SectionHeaderInVerse() + { + var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 1:1"), "Update 1") }; + var usfm = + @"\id MAT - Test +\c 1 +\p +\v 1 Beginning of verse +\s Section header +\p end of verse +"; + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [usfmUpdateBlockHandler]); + + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(3)); + AssertUpdateBlockEquals(usfmUpdateBlockHandler.Blocks[0], "MAT 1:0/1:p"); + AssertUpdateBlockEquals( + usfmUpdateBlockHandler.Blocks[1], + "MAT 1:1/1:s", + (UsfmUpdateBlockElementType.Text, "Section header ", false) + ); + AssertUpdateBlockEquals( + usfmUpdateBlockHandler.Blocks[2], + "MAT 1:1", + (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), + (UsfmUpdateBlockElementType.Text, "Update 1 ", false), + (UsfmUpdateBlockElementType.Text, "Beginning of verse ", true), + (UsfmUpdateBlockElementType.Paragraph, "\\s Section header ", false), + (UsfmUpdateBlockElementType.Paragraph, "\\p ", false), + (UsfmUpdateBlockElementType.Text, "end of verse ", true) + ); } private static ScriptureRef[] ScrRef(params string[] refs) @@ -998,7 +1160,8 @@ private static string UpdateUsfm( UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, - IReadOnlyCollection? preserveParagraphStyles = null + IEnumerable? preserveParagraphStyles = null, + IEnumerable? usfmUpdateBlockHandlers = null ) { if (source is null) @@ -1012,7 +1175,8 @@ private static string UpdateUsfm( paragraphBehavior, embedBehavior, styleBehavior, - preserveParagraphStyles + preserveParagraphStyles, + usfmUpdateBlockHandlers ); } else @@ -1032,14 +1196,52 @@ private static string UpdateUsfm( } } - private static void Assess(string target, string truth) + private static void AssertUsfmEquals(string target, string truth) { Assert.That(target, Is.Not.Null); - var target_lines = target.Split(new[] { "\n" }, StringSplitOptions.None); - var truth_lines = truth.Split(new[] { "\n" }, StringSplitOptions.None); + var target_lines = target.Split(["\n"], StringSplitOptions.None); + var truth_lines = truth.Split(["\n"], StringSplitOptions.None); for (int i = 0; i < truth_lines.Length; i++) { Assert.That(target_lines[i].Trim(), Is.EqualTo(truth_lines[i].Trim()), message: $"Line {i}"); } } + + private static void AssertUpdateBlockEquals( + UsfmUpdateBlock block, + string expectedRef, + params (UsfmUpdateBlockElementType, string, bool)[] expectedElements + ) + { + Assert.That(block.Refs.SequenceEqual([ScriptureRef.Parse(expectedRef)])); + Assert.That(block.Elements.Count, Is.EqualTo(expectedElements.Length)); + foreach ( + ( + UsfmUpdateBlockElement element, + (UsfmUpdateBlockElementType expectedType, string expectedUsfm, bool expectedMarkedForRemoval) + ) in block.Elements.Zip(expectedElements) + ) + { + Assert.That(element.Type, Is.EqualTo(expectedType)); + Assert.That(string.Join("", element.Tokens.Select(t => t.ToUsfm())), Is.EqualTo(expectedUsfm)); + Assert.That(element.MarkedForRemoval, Is.EqualTo(expectedMarkedForRemoval)); + } + } + + private class TestUsfmUpdateBlockHandler : UsfmUpdateBlockHandler + { + public List Blocks { get; } + + public TestUsfmUpdateBlockHandler() + { + Blocks = new List(); + } + + public override UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) + { + UsfmUpdateBlock newBlock = block.Clone(); + Blocks.Add(newBlock); + return newBlock; + } + } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs index c00888e28..14f720013 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs @@ -73,7 +73,7 @@ public void GetRows_NonEmptyText_AllText() IText text = corpus["MAT"]; TextRow[] rows = text.GetRows().ToArray(); - Assert.That(rows, Has.Length.EqualTo(50)); + Assert.That(rows, Has.Length.EqualTo(48)); Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/1:h", corpus.Versification))); Assert.That(rows[0].Text, Is.EqualTo("Matthew")); @@ -84,14 +84,11 @@ public void GetRows_NonEmptyText_AllText() Assert.That(rows[2].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip", corpus.Versification))); Assert.That(rows[2].Text, Is.EqualTo("An introduction to Matthew")); - Assert.That(rows[3].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip/1:fe", corpus.Versification))); - Assert.That(rows[3].Text, Is.EqualTo("This is an endnote.")); - - Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/4:p", corpus.Versification))); - Assert.That(rows[4].Text, Is.EqualTo("MAT 1 Here is another paragraph.")); + Assert.That(rows[3].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/4:p", corpus.Versification))); + Assert.That(rows[3].Text, Is.EqualTo("MAT 1 Here is another paragraph.")); Assert.That( - rows[7].Ref, + rows[6].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/7:weirdtaglookingthing", corpus.Versification)) ); Assert.That(rows[7].Text, Is.EqualTo("that is not an actual tag.")); @@ -99,44 +96,35 @@ public void GetRows_NonEmptyText_AllText() Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/8:s", corpus.Versification))); Assert.That(rows[8].Text, Is.EqualTo("Chapter One")); - Assert.That(rows[10].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1/1:f", corpus.Versification))); - Assert.That(rows[10].Text, Is.EqualTo("This is a footnote for v1.")); - - Assert.That(rows[12].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2/1:f", corpus.Versification))); - Assert.That(rows[12].Text, Is.EqualTo("This is a footnote for v2.")); - - Assert.That(rows[19].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/1:tc1", corpus.Versification))); - Assert.That(rows[19].Text, Is.EqualTo("Row one, column one.")); + Assert.That(rows[16].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/1:tc1", corpus.Versification))); + Assert.That(rows[16].Text, Is.EqualTo("Row one, column one.")); - Assert.That(rows[20].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/2:tc2", corpus.Versification))); - Assert.That(rows[20].Text, Is.EqualTo("Row one, column two.")); + Assert.That(rows[17].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/2:tc2", corpus.Versification))); + Assert.That(rows[17].Text, Is.EqualTo("Row one, column two.")); - Assert.That(rows[21].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/1:tc1", corpus.Versification))); - Assert.That(rows[21].Text, Is.EqualTo("Row two, column one.")); + Assert.That(rows[18].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/1:tc1", corpus.Versification))); + Assert.That(rows[18].Text, Is.EqualTo("Row two, column one.")); - Assert.That(rows[22].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/2:tc2", corpus.Versification))); - Assert.That(rows[22].Text, Is.EqualTo("Row two, column two.")); + Assert.That(rows[19].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/2:tc2", corpus.Versification))); + Assert.That(rows[19].Text, Is.EqualTo("Row two, column two.")); - Assert.That(rows[23].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification))); - Assert.That(rows[23].Text, Is.EqualTo("Chapter Two")); - - Assert.That(rows[24].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/4:p", corpus.Versification))); - Assert.That(rows[24].Text, Is.Empty); + Assert.That(rows[20].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification))); + Assert.That(rows[20].Text, Is.EqualTo("Chapter Two")); - Assert.That(rows[26].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1/1:f", corpus.Versification))); - Assert.That(rows[26].Text, Is.EqualTo("This is a footnote.")); + Assert.That(rows[21].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/4:p", corpus.Versification))); + Assert.That(rows[21].Text, Is.Empty); - Assert.That(rows[29].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/1:ms", corpus.Versification))); - Assert.That(rows[29].Text, Is.EqualTo("This is a sidebar")); + Assert.That(rows[26].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/1:ms", corpus.Versification))); + Assert.That(rows[26].Text, Is.EqualTo("This is a sidebar")); - Assert.That(rows[30].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/2:p", corpus.Versification))); - Assert.That(rows[30].Text, Is.EqualTo("Here is some sidebar content.")); + Assert.That(rows[27].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/2:p", corpus.Versification))); + Assert.That(rows[27].Text, Is.EqualTo("Here is some sidebar content.")); - Assert.That(rows[36].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:7a/1:s", corpus.Versification))); - Assert.That(rows[36].Text, Is.EqualTo("Section header")); + Assert.That(rows[33].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:7a/1:s", corpus.Versification))); + Assert.That(rows[33].Text, Is.EqualTo("Section header")); - Assert.That(rows[43].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:12/1:restore", corpus.Versification))); - Assert.That(rows[43].Text, Is.EqualTo("restore information")); + Assert.That(rows[40].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:12/1:restore", corpus.Versification))); + Assert.That(rows[40].Text, Is.EqualTo("restore information")); } [Test] @@ -256,7 +244,7 @@ public void GetRows_IncludeMarkers_AllText() IText text = corpus["MAT"]; TextRow[] rows = text.GetRows().ToArray(); - Assert.That(rows, Has.Length.EqualTo(46)); + Assert.That(rows, Has.Length.EqualTo(48)); Assert.That(rows[2].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip", corpus.Versification))); Assert.That(rows[2].Text, Is.EqualTo("An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*")); @@ -286,13 +274,13 @@ public void GetRows_IncludeMarkers_AllText() Assert.That(rows[20].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification))); Assert.That(rows[20].Text, Is.EqualTo("Chapter \\it Two \\it*")); - Assert.That(rows[22].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1", corpus.Versification))); + Assert.That(rows[23].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1", corpus.Versification))); Assert.That( - rows[22].Text, + rows[23].Text, Is.EqualTo("Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one.") ); - Assert.That(rows[26].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/2:p", corpus.Versification))); - Assert.That(rows[26].Text, Is.EqualTo("Here is some sidebar // content.")); + Assert.That(rows[27].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/2:p", corpus.Versification))); + Assert.That(rows[27].Text, Is.EqualTo("Here is some sidebar // content.")); } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index d2589e5a6..32d25a67e 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -148,11 +148,11 @@ public void GetRows_VersePara_BeginningNonVerseSegment() includeAllText: true ); - Assert.That(rows, Has.Length.EqualTo(5), string.Join(",", rows.Select(tr => tr.Text))); + Assert.That(rows, Has.Length.EqualTo(4), string.Join(",", rows.Select(tr => tr.Text))); Assert.That(rows[0].Text, Is.EqualTo("")); Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/1:q1"))); - Assert.That(rows[1].Text, Is.EqualTo("World")); - Assert.That(rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/1:q1/1:f"))); + Assert.That(rows[1].Text, Is.EqualTo("First verse in line!?!")); + Assert.That(rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1"))); } [Test] @@ -169,11 +169,11 @@ public void GetRows_VersePara_CommentFirst() includeAllText: true ); - Assert.That(rows[0].Text, Is.EqualTo("World")); - Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/1:f"))); - Assert.That(rows[1].Text, Is.EqualTo("This is a comment")); - Assert.That(rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/2:ip"))); - Assert.That(rows, Has.Length.EqualTo(3), string.Join(",", rows.Select(tr => tr.Text))); + Assert.That(rows[0].Text, Is.EqualTo("This is a comment")); + Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/2:ip"))); + Assert.That(rows[1].Text, Is.EqualTo("First verse in line!?!")); + Assert.That(rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1"))); + Assert.That(rows, Has.Length.EqualTo(2), string.Join(",", rows.Select(tr => tr.Text))); } [Test] From 0227ecf175796eb68bffd59b9cc561b7928edd8f Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 23 May 2025 16:31:04 -0400 Subject: [PATCH 3/9] Missing initialization --- src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 791b47b60..28ea9b5e7 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -55,6 +55,7 @@ public UpdateUsfmParserHandler( _rows = rows ?? Array.Empty<(IReadOnlyList, string)>(); _tokens = new List(); _updatedText = new List(); + _updateBlocks = new Stack(); _embedTokens = new List(); _idText = idText; _replace = new Stack(); From de12129415da2e2daca6f291071b2e5f7f0f2a00 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Sat, 24 May 2025 21:08:58 -0400 Subject: [PATCH 4/9] More fixes --- src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 28ea9b5e7..87e353d67 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -108,6 +108,7 @@ public override void EndBook(UsfmParserState state, string marker) PopNewTokens(); UsfmUpdateBlock updateBlock = _updateBlocks.Pop(); _tokens.AddRange(updateBlock.GetTokens()); + base.EndBook(state, marker); } @@ -174,6 +175,8 @@ string pubNumber UseUpdatedText(); base.Chapter(state, number, marker, altNumber, pubNumber); + + CollectReadonlyTokens(state); } public override void Milestone( @@ -400,8 +403,11 @@ private void CollectUpdatableTokens(UsfmParserState state) _embedTokens.Add(token); } else if ( - CurrentTextType != ScriptureTextType.None - || (state.ParaTag != null && state.ParaTag.Marker == "id") && _updateBlocks.Count > 0 + ( + CurrentTextType != ScriptureTextType.None + || (state.ParaTag != null && state.ParaTag.Marker == "id") + ) + && _updateBlocks.Count > 0 ) { _updateBlocks.Peek().AddToken(token); From 6cdd593c0a0444078fa5b17c9029e2cc833f90ca Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 27 May 2025 09:13:15 -0400 Subject: [PATCH 5/9] Add missing parameter passing --- src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 87e353d67..bda45155b 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -449,7 +449,7 @@ private void SkipUpdatableTokens(UsfmParserState state) { if (_updateBlocks.Count > 0) { - _updateBlocks.Peek().AddToken(token); + _updateBlocks.Peek().AddToken(token, markedForRemoval: true); } _tokenIndex++; } From 6e3e3705885dd1abf2b01d27d8c8e5ab8c770c62 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 27 May 2025 10:00:01 -0400 Subject: [PATCH 6/9] A couple more mis-ported lines --- .../Corpora/UpdateUsfmParserHandler.cs | 2 +- .../Corpora/UpdateUsfmParserHandlerTests.cs | 48 ++++++++++--------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index bda45155b..5c161d433 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -340,7 +340,7 @@ protected override void EndEmbedText(UsfmParserState state, ScriptureRef scriptu _updateBlocks .Peek() .AddEmbed(_embedTokens, markedForRemoval: _embedBehavior == UpdateUsfmMarkerBehavior.Strip); - base.EndEmbedText(state, scriptureRef); + _embedTokens.Clear(); } public string GetUsfm(string stylesheetFileName = "usfm.sty") diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 40ca62dd2..46c2bbfd3 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -844,12 +844,12 @@ public void UpdateBlock_Verse_PreserveParas() usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] ); - Assert.That(usfmUpdateBlockHandler.Blocks.Count == 1); + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; AssertUpdateBlockEquals( usfmUpdateBlock, - "MAT 1:1", + ["MAT 1:1"], (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse 1 ", true), @@ -875,12 +875,12 @@ public void UpdateBlock_Verse_StripParas() usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] ); - Assert.That(usfmUpdateBlockHandler.Blocks.Count == 1); + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; AssertUpdateBlockEquals( usfmUpdateBlock, - "MAT 1:1", + ["MAT 1:1"], (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse 1 ", true), @@ -906,12 +906,12 @@ public void UpdateBlock_Verse_Range() usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] ); - Assert.That(usfmUpdateBlockHandler.Blocks.Count == 1); + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; AssertUpdateBlockEquals( usfmUpdateBlock, - "MAT 1:1", + ["MAT 1:1", "MAT 1:2", "MAT 1:3"], (UsfmUpdateBlockElementType.Other, "\\v 1-3 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse 1 through 3 ", true) @@ -935,12 +935,12 @@ public void UpdateBlock_Footnote_PreserveEmbeds() usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] ); - Assert.That(usfmUpdateBlockHandler.Blocks.Count == 1); + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; AssertUpdateBlockEquals( usfmUpdateBlock, - "MAT 1:1", + ["MAT 1:1"], (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse", true), @@ -971,7 +971,7 @@ public void UpdateBlock_Footnote_StripEmbeds() UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; AssertUpdateBlockEquals( usfmUpdateBlock, - "MAT 1:1", + ["MAT 1:1"], (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse", true), @@ -985,7 +985,7 @@ public void UpdateBlock_NonVerse() { var rows = new List<(IReadOnlyList, string)> { - (ScrRef("MAT 1:0/1:s"), "Updated section header") + (ScrRef("MAT 1:0/1:s"), "Updated section Header") }; var usfm = @"\id MAT - Test @@ -1001,7 +1001,7 @@ public void UpdateBlock_NonVerse() UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; AssertUpdateBlockEquals( usfmUpdateBlock, - "MAT 1:0/1:s", + ["MAT 1:0/1:s"], (UsfmUpdateBlockElementType.Text, "Updated section Header ", false), (UsfmUpdateBlockElementType.Text, "Section header ", true) ); @@ -1029,7 +1029,7 @@ public void UpdateBlock_Verse_PreserveStyles() UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; AssertUpdateBlockEquals( usfmUpdateBlock, - "MAT 1:1", + ["MAT 1:1"], (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse ", true), @@ -1062,7 +1062,7 @@ public void UpdateBlock_Verse_StripStyles() UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; AssertUpdateBlockEquals( usfmUpdateBlock, - "MAT 1:1", + ["MAT 1:1"], (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "verse ", true), @@ -1090,15 +1090,15 @@ public void UpdateBlock_Verse_SectionHeader() UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [usfmUpdateBlockHandler]); Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(4)); - AssertUpdateBlockEquals(usfmUpdateBlockHandler.Blocks[0], "MAT 1:0/1:p"); + AssertUpdateBlockEquals(usfmUpdateBlockHandler.Blocks[0], ["MAT 1:0/1:p"]); AssertUpdateBlockEquals( usfmUpdateBlockHandler.Blocks[1], - "MAT 1:1/1:s", + ["MAT 1:1/1:s"], (UsfmUpdateBlockElementType.Text, "Section header ", false) ); AssertUpdateBlockEquals( usfmUpdateBlockHandler.Blocks[2], - "MAT 1:1", + ["MAT 1:1"], (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "Verse 1 ", true), @@ -1107,7 +1107,7 @@ public void UpdateBlock_Verse_SectionHeader() ); AssertUpdateBlockEquals( usfmUpdateBlockHandler.Blocks[3], - "MAT 1:2", + ["MAT 1:2"], (UsfmUpdateBlockElementType.Other, "\\v 2 ", false), (UsfmUpdateBlockElementType.Text, "Verse 2 ", false) ); @@ -1129,15 +1129,15 @@ public void UpdateBlock_Verse_SectionHeaderInVerse() UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [usfmUpdateBlockHandler]); Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(3)); - AssertUpdateBlockEquals(usfmUpdateBlockHandler.Blocks[0], "MAT 1:0/1:p"); + AssertUpdateBlockEquals(usfmUpdateBlockHandler.Blocks[0], ["MAT 1:0/1:p"]); AssertUpdateBlockEquals( usfmUpdateBlockHandler.Blocks[1], - "MAT 1:1/1:s", + ["MAT 1:1/1:s"], (UsfmUpdateBlockElementType.Text, "Section header ", false) ); AssertUpdateBlockEquals( usfmUpdateBlockHandler.Blocks[2], - "MAT 1:1", + ["MAT 1:1"], (UsfmUpdateBlockElementType.Other, "\\v 1 ", false), (UsfmUpdateBlockElementType.Text, "Update 1 ", false), (UsfmUpdateBlockElementType.Text, "Beginning of verse ", true), @@ -1189,7 +1189,8 @@ private static string UpdateUsfm( paragraphBehavior, embedBehavior, styleBehavior, - preserveParagraphStyles + preserveParagraphStyles, + usfmUpdateBlockHandlers ); UsfmParser.Parse(source, updater); return updater.GetUsfm(); @@ -1209,11 +1210,12 @@ private static void AssertUsfmEquals(string target, string truth) private static void AssertUpdateBlockEquals( UsfmUpdateBlock block, - string expectedRef, + string[] expectedRefs, params (UsfmUpdateBlockElementType, string, bool)[] expectedElements ) { - Assert.That(block.Refs.SequenceEqual([ScriptureRef.Parse(expectedRef)])); + var parsedExtractedRefs = expectedRefs.Select(r => ScriptureRef.Parse(r)); + Assert.That(block.Refs.SequenceEqual(parsedExtractedRefs)); Assert.That(block.Elements.Count, Is.EqualTo(expectedElements.Length)); foreach ( ( From 2d90a769eb79e6f2a8ec0b8ad819cbbd63211c03 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 27 May 2025 10:51:46 -0400 Subject: [PATCH 7/9] Fix misported test assertions --- tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs index 14f720013..142887057 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs @@ -91,10 +91,10 @@ public void GetRows_NonEmptyText_AllText() rows[6].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/7:weirdtaglookingthing", corpus.Versification)) ); - Assert.That(rows[7].Text, Is.EqualTo("that is not an actual tag.")); + Assert.That(rows[6].Text, Is.EqualTo("that is not an actual tag.")); - Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/8:s", corpus.Versification))); - Assert.That(rows[8].Text, Is.EqualTo("Chapter One")); + Assert.That(rows[7].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/8:s", corpus.Versification))); + Assert.That(rows[7].Text, Is.EqualTo("Chapter One")); Assert.That(rows[16].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/1:tc1", corpus.Versification))); Assert.That(rows[16].Text, Is.EqualTo("Row one, column one.")); @@ -114,10 +114,10 @@ public void GetRows_NonEmptyText_AllText() Assert.That(rows[21].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/4:p", corpus.Versification))); Assert.That(rows[21].Text, Is.Empty); - Assert.That(rows[26].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/1:ms", corpus.Versification))); + Assert.That(rows[26].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/1:ms", corpus.Versification))); Assert.That(rows[26].Text, Is.EqualTo("This is a sidebar")); - Assert.That(rows[27].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/2:p", corpus.Versification))); + Assert.That(rows[27].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/2:p", corpus.Versification))); Assert.That(rows[27].Text, Is.EqualTo("Here is some sidebar content.")); Assert.That(rows[33].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:7a/1:s", corpus.Versification))); @@ -280,7 +280,7 @@ public void GetRows_IncludeMarkers_AllText() Is.EqualTo("Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one.") ); - Assert.That(rows[27].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/2:esb/2:p", corpus.Versification))); + Assert.That(rows[27].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/2:p", corpus.Versification))); Assert.That(rows[27].Text, Is.EqualTo("Here is some sidebar // content.")); } } From 09a33e6ad8c2e89bf477c8d1813db7778dd5705d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 27 May 2025 16:09:55 -0400 Subject: [PATCH 8/9] Fix test --- tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index 32d25a67e..113da68ae 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -216,7 +216,6 @@ public void GetRows_OptBreak_OutsideOfSegment() @"\id MAT - Test \c 1 // -\p \v 1 This is the first verse. ", includeAllText: true, From f2f7fd72adc5cfa9bb721ab4da2f6c060dcdde9a Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 28 May 2025 08:44:22 -0400 Subject: [PATCH 9/9] Reviewer requested changes --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 2 +- .../ScriptureRefUsfmParserHandlerBase.cs | 12 +++--- .../Corpora/UpdateUsfmParserHandler.cs | 20 ++++------ src/SIL.Machine/Corpora/UsfmUpdateBlock.cs | 39 ++++++++++++------- .../Corpora/UsfmUpdateBlockHandler.cs | 4 +- .../Corpora/UpdateUsfmParserHandlerTests.cs | 6 +-- .../Corpora/UsfmMemoryTextTests.cs | 6 ++- 7 files changed, 47 insertions(+), 42 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 983b32132..02cf07e3e 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -28,7 +28,7 @@ public string UpdateUsfm( UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable preserveParagraphStyles = null, - IEnumerable updateBlockHandlers = null, + IEnumerable updateBlockHandlers = null, IEnumerable remarks = null ) { diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index 0db3581c9..242cc32d9 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -189,7 +189,7 @@ IReadOnlyList attributes // segment CheckConvertVerseParaToNonVerse(state); if (IsEmbedStyle(markerWithoutPlus)) - StartEmbedTextWrapper(state, markerWithoutPlus); + StartEmbedText(state, markerWithoutPlus); } public override void EndChar( @@ -200,17 +200,17 @@ bool closed ) { if (IsEmbedStyle(marker)) - EndEmbedTextWrapper(state); + EndEmbedText(state); } public override void StartNote(UsfmParserState state, string marker, string caller, string category) { - StartEmbedTextWrapper(state, marker); + StartEmbedText(state, marker); } public override void EndNote(UsfmParserState state, string marker, bool closed) { - EndEmbedTextWrapper(state); + EndEmbedText(state); } protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { } @@ -263,7 +263,7 @@ private void UpdateVerseRef(VerseRef verseRef, string marker) _curVerseRef = verseRef; } - private void StartEmbedTextWrapper(UsfmParserState state, string marker) + private void StartEmbedText(UsfmParserState state, string marker) { if (_curVerseRef.IsDefault) UpdateVerseRef(state.VerseRef, marker); @@ -276,7 +276,7 @@ private void StartEmbedTextWrapper(UsfmParserState state, string marker) } } - private void EndEmbedTextWrapper(UsfmParserState state) + private void EndEmbedText(UsfmParserState state) { if (!_duplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed) { diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 5c161d433..ae28245f0 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -34,7 +34,7 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase private readonly UpdateUsfmMarkerBehavior _styleBehavior; private readonly HashSet _preserveParagraphStyles; private readonly Stack _updateBlocks; - private readonly Stack _updateBlockHandlers; + private readonly Stack _updateBlockHandlers; private readonly List _remarks; private readonly Stack _replace; private int _rowIndex; @@ -48,7 +48,7 @@ public UpdateUsfmParserHandler( UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable preserveParagraphStyles = null, - IEnumerable updateBlockHandlers = null, + IEnumerable updateBlockHandlers = null, IEnumerable remarks = null ) { @@ -65,8 +65,8 @@ public UpdateUsfmParserHandler( _styleBehavior = styleBehavior; _updateBlockHandlers = updateBlockHandlers == null - ? new Stack() - : new Stack(updateBlockHandlers); + ? new Stack() + : new Stack(updateBlockHandlers); _preserveParagraphStyles = preserveParagraphStyles == null ? new HashSet { "r", "rem" } @@ -403,10 +403,7 @@ private void CollectUpdatableTokens(UsfmParserState state) _embedTokens.Add(token); } else if ( - ( - CurrentTextType != ScriptureTextType.None - || (state.ParaTag != null && state.ParaTag.Marker == "id") - ) + (CurrentTextType != ScriptureTextType.None || state.ParaTag?.Marker == "id") && _updateBlocks.Count > 0 ) { @@ -442,10 +439,7 @@ private void SkipUpdatableTokens(UsfmParserState state) while (_tokenIndex <= state.Index + state.SpecialTokenCount) { UsfmToken token = state.Tokens[_tokenIndex]; - if ( - CurrentTextType != ScriptureTextType.None - || (state.ParaTag != null && state.ParaTag.Marker == "id") - ) + if (CurrentTextType != ScriptureTextType.None || (state.ParaTag?.Marker == "id")) { if (_updateBlocks.Count > 0) { @@ -510,7 +504,7 @@ private void EndUpdateBlock(IReadOnlyList scriptureRefs) PopNewTokens(); UsfmUpdateBlock updateBlock = _updateBlocks.Pop(); updateBlock.UpdateRefs(scriptureRefs); - foreach (UsfmUpdateBlockHandler handler in _updateBlockHandlers) + foreach (IUsfmUpdateBlockHandler handler in _updateBlockHandlers) { updateBlock = handler.ProcessBlock(updateBlock); } diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs index 8ee7cce44..df64a4d60 100644 --- a/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlock.cs @@ -5,21 +5,30 @@ namespace SIL.Machine.Corpora { public class UsfmUpdateBlock { - public List Refs { get; } - public List Elements { get; } + public IReadOnlyList Refs + { + get => _refs; + } + public IReadOnlyList Elements + { + get => _elements; + } + + private readonly List _refs; + private readonly List _elements; public UsfmUpdateBlock( IEnumerable refs = null, IEnumerable elements = null ) { - Refs = refs != null ? refs.ToList() : new List(); - Elements = elements != null ? elements.ToList() : new List(); + _refs = refs != null ? refs.ToList() : new List(); + _elements = elements != null ? elements.ToList() : new List(); } public void AddText(IEnumerable tokens) { - Elements.Add(new UsfmUpdateBlockElement(UsfmUpdateBlockElementType.Text, tokens.ToList())); + _elements.Add(new UsfmUpdateBlockElement(UsfmUpdateBlockElementType.Text, tokens.ToList())); } public void AddToken(UsfmToken token, bool markedForRemoval = false) @@ -41,30 +50,30 @@ public void AddToken(UsfmToken token, bool markedForRemoval = false) type = UsfmUpdateBlockElementType.Other; break; } - Elements.Add(new UsfmUpdateBlockElement(type, new List { token }, markedForRemoval)); + _elements.Add(new UsfmUpdateBlockElement(type, new List { token }, markedForRemoval)); } public void AddEmbed(IEnumerable tokens, bool markedForRemoval = false) { - Elements.Add( + _elements.Add( new UsfmUpdateBlockElement(UsfmUpdateBlockElementType.Embed, tokens.ToList(), markedForRemoval) ); } public void ExtendLastElement(IEnumerable tokens) { - Elements.Last().Tokens.AddRange(tokens); + _elements.Last().Tokens.AddRange(tokens); } public void UpdateRefs(IEnumerable refs) { - Refs.Clear(); - Refs.AddRange(refs); + _refs.Clear(); + _refs.AddRange(refs); } public List GetTokens() { - return Elements.SelectMany(e => e.GetTokens()).ToList(); + return _elements.SelectMany(e => e.GetTokens()).ToList(); } public override bool Equals(object obj) @@ -74,20 +83,20 @@ public override bool Equals(object obj) UsfmUpdateBlock other = (UsfmUpdateBlock)obj; - return Refs.SequenceEqual(other.Refs) && Elements.SequenceEqual(other.Elements); + return _refs.SequenceEqual(other._refs) && _elements.SequenceEqual(other._elements); } public override int GetHashCode() { int hash = 23; - hash = hash * 31 + Refs.GetHashCode(); - hash = hash * 31 + Elements.GetHashCode(); + hash = hash * 31 + _refs.GetHashCode(); + hash = hash * 31 + _elements.GetHashCode(); return hash; } public UsfmUpdateBlock Clone() { - return new UsfmUpdateBlock(Refs, Elements); + return new UsfmUpdateBlock(_refs, _elements); } } } diff --git a/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs b/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs index 68eb0d53f..a255665fa 100644 --- a/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs +++ b/src/SIL.Machine/Corpora/UsfmUpdateBlockHandler.cs @@ -1,7 +1,7 @@ namespace SIL.Machine.Corpora { - public abstract class UsfmUpdateBlockHandler + public interface IUsfmUpdateBlockHandler { - public abstract UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block); + UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block); } } diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 46c2bbfd3..2b24b167c 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -1161,7 +1161,7 @@ private static string UpdateUsfm( UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve, UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip, IEnumerable? preserveParagraphStyles = null, - IEnumerable? usfmUpdateBlockHandlers = null + IEnumerable? usfmUpdateBlockHandlers = null ) { if (source is null) @@ -1230,7 +1230,7 @@ private static void AssertUpdateBlockEquals( } } - private class TestUsfmUpdateBlockHandler : UsfmUpdateBlockHandler + private class TestUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler { public List Blocks { get; } @@ -1239,7 +1239,7 @@ public TestUsfmUpdateBlockHandler() Blocks = new List(); } - public override UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) + public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block) { UsfmUpdateBlock newBlock = block.Clone(); Blocks.Add(newBlock); diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index 113da68ae..028968b37 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -216,6 +216,7 @@ public void GetRows_OptBreak_OutsideOfSegment() @"\id MAT - Test \c 1 // +\p \v 1 This is the first verse. ", includeAllText: true, @@ -223,8 +224,9 @@ public void GetRows_OptBreak_OutsideOfSegment() ); Assert.Multiple(() => { - Assert.That(rows, Has.Length.EqualTo(1), string.Join(",", rows.Select(tr => tr.Text))); - Assert.That(rows[0].Text, Is.EqualTo("This is the first verse.")); + Assert.That(rows, Has.Length.EqualTo(2), string.Join(",", rows.Select(tr => tr.Text))); + Assert.That(rows[0].Text, Is.EqualTo("")); + Assert.That(rows[1].Text, Is.EqualTo("This is the first verse.")); }); }