From 9f433b6e2fc3b367c5c86c8705a6915c8f869645 Mon Sep 17 00:00:00 2001 From: Peter Chapman Date: Tue, 30 Sep 2025 12:21:13 +1300 Subject: [PATCH] Correctly parse and place verse text in verse 0 --- .../ScriptureRefUsfmParserHandlerBase.cs | 40 ++++++++----- .../Corpora/UpdateUsfmParserHandler.cs | 10 +++- src/SIL.Machine/Corpora/UsfmParser.cs | 3 + src/SIL.Machine/Corpora/UsfmParserState.cs | 14 +++-- ...PlaceMarkersUsfmUpdateBlockHandlerTests.cs | 57 +++++++++++++++++++ .../Corpora/UsfmMemoryTextTests.cs | 45 ++++++++++++++- 6 files changed, 145 insertions(+), 24 deletions(-) diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index f5a52a79..9e34968e 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -1,6 +1,5 @@ -using System.Collections.Generic; +using System.Collections.Generic; using System.Linq; -using SIL.Extensions; using SIL.Scripture; namespace SIL.Machine.Corpora @@ -67,7 +66,11 @@ public override void Verse( string pubNumber ) { - if (state.VerseRef.Equals(_curVerseRef) && !DuplicateVerse) + if (state.ChapterHasVerseZero && state.VerseRef.VerseNum == 0) + { + // Fall through for the special case of verse 0 being specified in the USFM + } + else if (state.VerseRef.Equals(_curVerseRef) && !DuplicateVerse) { if (state.VerseRef.VerseNum > 0) { @@ -75,6 +78,8 @@ string pubNumber // ignore duplicate verses DuplicateVerse = true; } + + return; } else if (VerseRef.AreOverlappingVersesRanges(verse1: number, verse2: _curVerseRef.Verse)) { @@ -82,16 +87,15 @@ string pubNumber VerseRef verseRef = _curVerseRef.Clone(); verseRef.Verse = CorporaUtils.MergeVerseRanges(number, _curVerseRef.Verse); UpdateVerseRef(verseRef, marker); + return; } + + if (CurrentTextType == ScriptureTextType.NonVerse) + EndNonVerseText(state); else - { - if (CurrentTextType == ScriptureTextType.NonVerse) - 
EndNonVerseText(state); - else - EndVerseText(state); - UpdateVerseRef(state.VerseRef, marker); - StartVerseText(state); - } + EndVerseText(state); + UpdateVerseRef(state.VerseRef, marker); + StartVerseText(state); } public override void StartPara( @@ -259,9 +263,9 @@ private void StartVerseText(UsfmParserState state) private void EndVerseText(UsfmParserState state) { - if (!DuplicateVerse && _curVerseRef.VerseNum > 0) + if (!DuplicateVerse && (_curVerseRef.VerseNum > 0 || state.ChapterHasVerseZero)) EndVerseText(state, CreateVerseRefs()); - if (_curVerseRef.VerseNum > 0) + if (_curVerseRef.VerseNum > 0 || state.ChapterHasVerseZero) _curTextType.Pop(); } @@ -280,7 +284,14 @@ private void EndNonVerseText(UsfmParserState state) private void UpdateVerseRef(VerseRef verseRef, string marker) { - if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef)) + if (_curVerseRef.VerseNum == 0 && verseRef.VerseNum == 0 && marker == "v") + { + // As the verse 0 marker appears within the middle of verse 0, + // we should not break the position of current element stack by clearing it. + // Instead, we just need to pop the current element off the stack. 
+ _curElements.Pop(); + } + else if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef)) { _curElements.Clear(); _curElements.Push(new ScriptureElement(0, marker)); @@ -357,6 +368,7 @@ private void CheckConvertVerseParaToNonVerse(UsfmParserState state) && paraTag.Marker != "tr" && state.IsVersePara && _curVerseRef.VerseNum == 0 + && !state.ChapterHasVerseZero && !IsPrivateUseMarker(paraTag.Marker) ) { diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 4b9c37ef..d6a23618 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.Linq; using SIL.Scripture; @@ -107,7 +107,7 @@ public UpdateUsfmParserHandler( preserveParagraphStyles == null ? new HashSet { "r", "rem" } : new HashSet(preserveParagraphStyles); - _remarks = remarks == null ? new List() : remarks.ToList(); + _remarks = remarks?.ToList() ?? new List(); _errorHandler = errorHandler; if (_errorHandler == null) _errorHandler = (error) => false; @@ -457,6 +457,12 @@ IReadOnlyList segScrRefs var rowTexts = new List(); Dictionary rowMetadata = null; int sourceIndex = 0; + + // Handle the special case of verse 0: although it is first in the rows, + // it is requested after other segments of that verse, so restart the search at the first row. + if (segScrRefs.Count > 0 && segScrRefs[0].VerseNum == 0 && segScrRefs[0].Path.Count == 0) + _verseRowIndex = 0; + // search the sorted rows with updated text, starting from where we left off last. 
 while (_verseRowIndex < _verseRows.Count && sourceIndex < segScrRefs.Count) { diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 8028b2fa..cdf72b97 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -355,6 +355,7 @@ public bool ProcessToken() vref = State.VerseRef; vref.Chapter = token.Data; vref.VerseNum = 0; + State.ChapterHasVerseZero = false; State.VerseRef = vref; // Verse offset is not zeroed for chapter 1, as it is part of intro if (State.VerseRef.ChapterNum != 1) @@ -391,6 +392,8 @@ public bool ProcessToken() // Verse vref = State.VerseRef; vref.Verse = token.Data; + if (vref.VerseNum == 0) + State.ChapterHasVerseZero = true; State.VerseRef = vref; State.VerseOffset = 0; diff --git a/src/SIL.Machine/Corpora/UsfmParserState.cs b/src/SIL.Machine/Corpora/UsfmParserState.cs index b6784096..66f9e3e0 100644 --- a/src/SIL.Machine/Corpora/UsfmParserState.cs +++ b/src/SIL.Machine/Corpora/UsfmParserState.cs @@ -76,6 +76,11 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn /// public int SpecialTokenCount { get; internal set; } + /// + /// True if the current chapter explicitly specifies verse 0 in the USFM. + /// + public bool ChapterHasVerseZero { get; internal set; } + /// /// True if the token processed is a figure. 
/// @@ -104,10 +109,7 @@ public UsfmTag ParaTag /// /// Innermost character tag or null for none /// - public UsfmTag CharTag - { - get { return CharTags.FirstOrDefault(); } - } + public UsfmTag CharTag => CharTags.FirstOrDefault(); /// /// Current note tag or null for none @@ -157,8 +159,8 @@ public bool IsVerseText { get { - // Anything before verse 1 is not verse text - if (VerseRef.VerseNum == 0) + // Anything before verse 1 is not verse text, unless the USFM specified verse 0 + if (VerseRef.VerseNum == 0 && !ChapterHasVerseZero) return false; // Sidebars and notes are not verse text diff --git a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs index b7503770..0fa163e7 100644 --- a/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs @@ -805,6 +805,63 @@ public void UpdateUsfm_StripParagraphsWithHeaders() AssertUsfmEquals(target, result); } + [Test] + public void UpdateUsfm_SupportVerseZero() + { + // Note: Verse 0 has an empty paragraph as the paragraph occurs before verse text, + // so is not included in the verse text as it is for the paragraphs for the other verses. 
+ IReadOnlyList rows = + [ + new UpdateUsfmRow(ScrRef("MAT 1:0"), "New verse 0"), + new UpdateUsfmRow(ScrRef("MAT 1:0/1:mt"), "New book header"), + new UpdateUsfmRow(ScrRef("MAT 1:0/2:s"), "New chapter header"), + new UpdateUsfmRow(ScrRef("MAT 1:0/3:p"), ""), + new UpdateUsfmRow(ScrRef("MAT 1:0/4:ms"), "New major section header"), + new UpdateUsfmRow(ScrRef("MAT 1:0/5:s"), "New section header 1"), + new UpdateUsfmRow(ScrRef("MAT 1:1"), "New verse 1"), + new UpdateUsfmRow(ScrRef("MAT 1:1/1:s"), "New section header 2"), + new UpdateUsfmRow(ScrRef("MAT 1:2"), "New verse 2"), + new UpdateUsfmRow(ScrRef("MAT 1:3"), "New verse 3"), + ]; + string usfm = + @"\id MAT +\mt Old book header +\c 1 +\s Old chapter header +\p +\v 0 Old verse 0 +\ms Old major section header +\s Old section header 1 +\p +\v 1 Old verse 1 +\s Old section header 2 +\p +\v 2 Old verse 2 +\v 3 Old verse 3 +"; + + string target = UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()]); + + string result = + @"\id MAT +\mt New book header +\c 1 +\s New chapter header +\p +\v 0 New verse 0 +\ms New major section header +\s New section header 1 +\p +\v 1 New verse 1 +\s New section header 2 +\p +\v 2 New verse 2 +\v 3 New verse 3 +"; + + AssertUsfmEquals(target, result); + } + private static ScriptureRef[] ScrRef(params string[] refs) { return refs.Select(r => ScriptureRef.Parse(r)).ToArray(); diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index d7874af1..dfbe8e7d 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -294,14 +294,55 @@ public void GetRows_VerseZero() Assert.Multiple(() => { - Assert.That(rows, Has.Length.EqualTo(1)); + Assert.That(rows, Has.Length.EqualTo(2)); + + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:0")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) 
+ ); + Assert.That(rows[0].Text, Is.Empty, string.Join(",", rows.ToList().Select(tr => tr.Text))); + + Assert.That( + rows[1].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That(rows[1].Text, Is.EqualTo("Verse one."), string.Join(",", rows.ToList().Select(tr => tr.Text))); + }); + } + + [Test] + public void GetRows_VerseZeroWithText() + { + TextRow[] rows = GetRows( + @"\id MAT - Test +\h +\mt +\c 1 +\p \v 0 Verse zero. +\s +\p \v 1 Verse one. +" + ); + + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(2)); Assert.That( rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:0")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That(rows[0].Text, Is.EqualTo("Verse zero."), string.Join(",", rows.ToList().Select(tr => tr.Text))); + + Assert.That( + rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) ); - Assert.That(rows[0].Text, Is.EqualTo("Verse one."), string.Join(",", rows.ToList().Select(tr => tr.Text))); + Assert.That(rows[1].Text, Is.EqualTo("Verse one."), string.Join(",", rows.ToList().Select(tr => tr.Text))); }); }