diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index 242cc32d..2c268ebf 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -33,7 +33,12 @@ protected ScriptureRefUsfmParserHandlerBase() private static bool IsEmbedStyle(string marker) { - return marker != null && (EmbedStyles.Contains(marker.Trim('*')) || marker.StartsWith("z")); + return marker != null && EmbedStyles.Contains(marker.Trim('*')); + } + + private static bool IsPrivateUseMarker(string marker) + { + return marker != null && marker.StartsWith("z"); } public override void EndUsfm(UsfmParserState state) @@ -63,9 +68,12 @@ string pubNumber { if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse) { - EndVerseText(state, CreateVerseRefs()); - // ignore duplicate verses - _duplicateVerse = true; + if (state.VerseRef.VerseNum > 0) + { + EndVerseText(state, CreateVerseRefs()); + // ignore duplicate verses + _duplicateVerse = true; + } } else if (VerseRef.AreOverlappingVersesRanges(verse1: number, verse2: _curVerseRef.Verse)) { @@ -92,6 +100,10 @@ public override void StartPara( IReadOnlyList attributes ) { + // ignore private-use markers + if (IsPrivateUseMarker(marker)) + return; + if (_curVerseRef.IsDefault) UpdateVerseRef(state.VerseRef, marker); @@ -104,6 +116,10 @@ IReadOnlyList attributes public override void EndPara(UsfmParserState state, string marker) { + // ignore private-use markers + if (IsPrivateUseMarker(marker)) + return; + if (CurrentTextType == ScriptureTextType.NonVerse) { EndParentElement(); @@ -185,6 +201,10 @@ public override void StartChar( IReadOnlyList attributes ) { + // ignore private-use markers + if (IsPrivateUseMarker(markerWithoutPlus)) + return; + // if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse // segment CheckConvertVerseParaToNonVerse(state); @@ -199,6 +219,10 @@ public override void EndChar( bool closed ) { + // ignore private-use markers + if (IsPrivateUseMarker(marker)) + return; + if (IsEmbedStyle(marker)) EndEmbedText(state); } @@ -332,6 +356,7 @@ private void CheckConvertVerseParaToNonVerse(UsfmParserState state) && paraTag.Marker != "tr" && state.IsVersePara && _curVerseRef.VerseNum == 0 + && !IsPrivateUseMarker(paraTag.Marker) ) { StartParentElement(paraTag.Marker); diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index d9c7f88d..d7874af1 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -278,6 +278,63 @@ public void GetRows_StyleStartingNonVerseParagraphAfterEmptyParagraph() }); } + [Test] + public void GetRows_VerseZero() + { + TextRow[] rows = GetRows( + @"\id MAT - Test +\h +\mt +\c 1 +\p \v 0 +\s +\p \v 1 Verse one. +" + ); + + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(1)); + + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That(rows[0].Text, Is.EqualTo("Verse one."), string.Join(",", rows.ToList().Select(tr => tr.Text))); + }); + } + + [Test] + public void GetRows_PrivateUseMarker() + { + TextRow[] rows = GetRows( + @"\id FRT - Test English Apocrypha +\zmt Ignore this paragraph +\mt1 Test English Apocrypha +\pc Copyright Statement \zimagecopyrights +\pc Further copyright statements +", + includeAllText: true + ); + + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(3)); + + Assert.That( + rows[1].Ref, + Is.EqualTo(ScriptureRef.Parse("FRT 1:0/2:pc")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That( + rows[1].Text, + Is.EqualTo("Copyright Statement"), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); + }); + } + private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false) { UsfmMemoryText text =